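"""Download images from one or more subreddits into a zip archive.

The script scrapes the listing pages of old.reddit.com, follows the '?after='
pagination links, downloads every linked jpg/jpeg/png image and stores it in
'<subreddit>.zip' (or the file given with -o). The helper modules cutils,
netutils and fsutils are expected in ./lib.
"""
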
import zipfile
import time
import os
import sys
import optparse
import shutil
from lib import cutils, netutils, fsutils
blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']  # hosts we never download from
dl_dir = './.cache/'  # temporary download directory
img_ext = ['jpg', 'jpeg', 'png']  # image extensions we are searching for
hdr = {  # request header, pretending to be a regular browser
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
                  'Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'}
errors = {}  # collected errors (message -> number of occurrences)
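
# Filter for BeautifulSoup.find_all: keep tags whose 'src' (or lazy-loaded
# 'data-url') attribute points to one of the wanted image extensions.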
def has_source(tag: netutils.BeautifulSoup) -> bool:
    if tag.has_attr('src'):
        try:
            return fsutils.get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    elif tag.has_attr('data-url'):
        try:
            tag['src'] = tag['data-url']
            return fsutils.get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    else:
        return False
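
# Reddit paginates listings with '?after=<fullname>': take the last post id on
# the current page and build the URL of the next (older) page.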
def get_next_url(baseurl: str, url: str):
    ids = []
    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return False
    for t in soup.find_all(has_source):
        if 'redditmedia' not in t['src']:
            try:
                fname = t['data-fullname']
                ids.append(fname)
            except KeyError:
                pass
    ids = [_id for _id in ids if _id]
    if len(ids) == 0:  # if no id was found, we can't get any further into the past
        return None
    _id = ids[-1]
    next_url = '{}/?after={}'.format(baseurl, _id)
    return next_url
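
# Collect the absolute URLs of all images linked on a single listing page.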
def get_img4site(url: str) -> list:
    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return []
    ret = []
    sys.stdout.write('.')
    sys.stdout.flush()
    for t in soup.find_all(has_source):
        try:
            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
                img = t['src']
                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                    # relative path -> prepend the page url
                    img = url + img
                if 'http' not in img.split('/')[0]:
                    # protocol-relative url ('//host/...') -> add a scheme
                    img = 'http:' + img
                # drop images hosted on a blacklisted domain
                if img.split('//')[-1].split('/')[0] in blacklist:
                    img = None
                if img:
                    ret.append(img)
        except KeyError:
            pass
    return ret
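
# Walk through the listing pages of a subreddit and gather image URLs until
# 'length' images were found (or until the listing ends when length is negative).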
def get_img4sub(url: str, length: int = -1) -> list:
    baseurl = url
    imgs = []
    print('[~] 1/2 Getting images...')
    if length >= 0:
        x = 0
        while x < length:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            x = len(imgs)
            url = get_next_url(baseurl, url)
            if not url:
                break
            sys.stdout.write('\b')
        imgs = imgs[:length]
    else:
        while url:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            url = get_next_url(baseurl, url)
    print()
    print('[+] Found %s images' % len(imgs))
    return imgs
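
# Download every image into the cache directory, move it into the zip archive
# and skip files that are already present in the archive.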
def download_images(imgs: list, zfile: zipfile.ZipFile):
    imgcount = len(imgs)
    fnames = [zinfo.filename for zinfo in zfile.infolist()]
    print('[~] Downloading %s images' % imgcount)
    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloading", suffix="Complete")
    fsutils.dir_exist_guarantee(dl_dir)
    for img in imgs:
        pb.tick()
        imgname = img.split('/')[-1]
        name = os.path.join(dl_dir, imgname)
        if os.path.isfile(name) or imgname in fnames:
            continue
        netutils.download_file(img, name, headers=hdr)
        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
        try:
            os.remove(name)
        except (FileNotFoundError, PermissionError):
            pass
        time.sleep(0.1)  # be polite, don't hammer the server
    added = len(zfile.infolist()) - len(fnames)
    print('[+] Added %s files to the zipfile' % added)
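
# Fetch and archive the images of a single subreddit; append to an existing
# zipfile instead of overwriting it.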
def download_subreddit(sub: str, count: int = -1, out: str = None):
    mode = 'w'
    zname = sub + '.zip'
    if out:
        zname = out
    if os.path.isfile(zname):
        mode = 'a'
    url = 'https://old.reddit.com/r/%s/' % sub
    imgs = get_img4sub(url, length=count)
    zfile = zipfile.ZipFile(zname, mode)
    download_images(imgs, zfile)
    zfile.close()
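
# Remove the temporary download directory.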
def cleanup():
    print('[~] Cleanup...')
    if os.path.isdir(dl_dir):
        shutil.rmtree(dl_dir)
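
# Build the optparse command line interface and return the parsed options and
# positional arguments (the subreddit names).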
def parser_init():
    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
    parser.add_option('-c', '--count', dest='count',
                      type='int', default=-1,
                      help='The number of images to download.')
    parser.add_option('-o', '--output', dest='output',
                      type='string', default=None,
                      help='The name of the output zipfile. If none is specified, '
                           'it\'s the subreddit\'s name.')
    parser.add_option('-t', '--test', dest='test',
                      action='store_true', default=False,
                      help='Tests the functions of the script.')
    parser.add_option('-l', '--loop', dest='loop',
                      action='store_true', default=False,
                      help='Continuous download loop. When this option is set, the '
                           'program searches for new images every 5 minutes.')
    return parser.parse_args()
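
# Download every subreddit that was given on the command line.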
def download_subreddits(subreddits, count, output):
    for sub in subreddits:
        print('[~] Downloading %s' % sub)
        download_subreddit(sub, count=count, out=output)
        print()
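
# Parse the command line, run the downloads (once or in a 5 minute loop) and
# print the collected errors at the end.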
def main():
    options, subreddits = parser_init()
    count = options.count
    output = options.output
    if options.test:
        count = 1
        subreddits = ['python']
        output = 'test.zip'
    if options.loop:
        while True:
            download_subreddits(subreddits, count, output)
            print('[~] Next download in 5 minutes...')
            time.sleep(300)
    else:
        download_subreddits(subreddits, count, output)
    cleanup()
    if options.test:
        os.remove(output)
    if len(errors.keys()) > 0:
        print('[-] Following errors occurred:')
        for key in errors.keys():
            print(' %s times: %s' % (errors[key], key))


if __name__ == '__main__':
    main()
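
# Example invocation (the script's file name is an assumption, adjust as needed):
#   python reddit_downloader.py -c 100 -o wallpapers.zip wallpapers
# downloads up to 100 images from r/wallpapers into wallpapers.zip.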