From a513b5348d97e4dad97f5fe2f99d200b70cd30bf Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Sun, 18 Nov 2018 22:20:44 +0100
Subject: [PATCH] Started working on riddle2

New Features:
- First fetches all image URLs (or a specific count) and starts the download
  afterwards -> the user sees a progress bar showing the total download
  progress instead of the progress of a single page
---
 riddle.py  |   2 +-
 riddle2.py | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 riddle2.py

diff --git a/riddle.py b/riddle.py
index 4517597..32cb15b 100644
--- a/riddle.py
+++ b/riddle.py
@@ -101,7 +101,7 @@ async def download_async(url, zfile=None, test=False):
     print_progress(count, imgcount, prefix="Downloading: ", suffix="Complete")
     for img in images:
         print_progress(count+1, imgcount, prefix="Downloading: ", suffix="Complete")
-        count+=1
+        count += 1
         if test:
             continue
         try:
diff --git a/riddle2.py b/riddle2.py
new file mode 100644
index 0000000..42ee7ab
--- /dev/null
+++ b/riddle2.py
@@ -0,0 +1,175 @@
+import urllib.request as urlreq
+
+from bs4 import BeautifulSoup
+import zipfile
+import time
+import os
+import sys
+
+blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
+dl_dir = './.cache/'
+img_ext = ['jpg', 'jpeg', 'png']  # image extensions we are searching for
+hdr = {  # request header; header values must not contain newlines
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
+                  'Chrome/23.0.1271.64 Safari/537.11',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+    'Accept-Encoding': 'none',
+    'Accept-Language': 'en-US,en;q=0.8',
+    'Connection': 'keep-alive'}
+
+
+def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
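+    # Render a one-line console progress bar; iteration/total determine the fill.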
+    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
+    filled_length = int(length * iteration // total)
+    bar = fill * filled_length + '-' * (length - filled_length)
+    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+    # Print New Line on Complete
+    if iteration == total:
+        print()
+
+
+def spinning_cursor():
+    while True:
+        for cursor in '|/-\\':
+            yield cursor
+
+
+def get_extension(fstring):
+    return fstring.split('.')[-1].lower()
+
+
+def get_soup4url(url):
+    """ Returns a soup for the url with 10 retries """
+    req = urlreq.Request(url, headers=hdr)
+    html = None
+    for x in range(0, 10):
+        try:
+            html = urlreq.urlopen(req).read()
+            break
+        except Exception as e:
+            print('[-]', e)
+    if html:
+        soup = BeautifulSoup(html, "lxml")
+        return soup
+    return False
+
+
+def has_source(tag):
+    if tag.has_attr('src'):
+        try:
+            return get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    elif tag.has_attr('data-url'):
+        try:
+            tag['src'] = tag['data-url']
+            return get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    else:
+        return False
+
+
+def get_next_url(baseurl, url):
+    ids = []
+    soup = get_soup4url(url)
+    if not soup:
+        return False
+    for t in soup.find_all(has_source):
+        if 'redditmedia' not in t['src']:
+            try:
+                fname = t['data-fullname']
+                ids.append(fname)
+            except KeyError:
+                pass
+    ids = [_id for _id in ids if _id]
+    if len(ids) == 0:
+        return False
+    _id = ids[-1]
+    next_url = '{}/?after={}'.format(baseurl, _id)
+    return next_url
+
+
+def get_img4site(url):
+    soup = get_soup4url(url)
+    if not soup:
+        return []  # empty list so callers can extend() safely
+    ret = []
+    sys.stdout.write('.')
+    sys.stdout.flush()
+    for t in soup.find_all(has_source):
+        try:
+            if 'redditmedia' not in t['src']:
+                img = t['src']
+                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
+                    img = url + img
+                if 'http' not in img.split('/')[0]:
+                    img = 'http:' + img
+                if img.split('//')[-1].split('/')[0] in blacklist:  # hostname of the url
+                    img = None
+                if img:
+                    ret.append(img)
+        except KeyError:
+            pass
+    return ret
+
+
+def get_img4sub(url, length=-1):
+    baseurl = url
+    imgs = []
+    print('[ ] 1/2 Getting images...')
+    if length >= 0:
+        for x in range(length):
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgs.extend(get_img4site(url))
+            url = get_next_url(baseurl, url)
+            if not url:
+                break
+        sys.stdout.write('\b')
+    else:
+        while url:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgs.extend(get_img4site(url))
+            url = get_next_url(baseurl, url)
+    return imgs
+
+
+def download_images(imgs, zfile):
+    count = 0
+    imgcount = len(imgs)
+    print('[ ] Downloading %s images' % imgcount)
+    if not os.path.isdir(dl_dir):
+        os.mkdir(dl_dir)
+    print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+    for img in imgs:
+        print_progress(count+1, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+        imgname = img.split('/')[-1]
+        name = dl_dir + imgname
+        if not os.path.isfile(name):  # reuse files cached by an earlier run
+            req = urlreq.Request(img, headers=hdr)
+            image = urlreq.urlopen(req)
+            with open(name, "wb") as f:
+                f.write(image.read())
+        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
+        try:
+            os.remove(name)
+        except (FileNotFoundError, PermissionError):
+            pass
+        time.sleep(0.1)  # we don't want to flood with requests
+        count += 1
+
+
+def download_subreddit(sub):
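+    # 'a' keeps images from an earlier run when the archive already exists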
+    mode = 'w'
+    if os.path.isfile(sub + '.zip'):
+        mode = 'a'
+    url = 'https://old.reddit.com/r/%s/' % sub
+    imgs = get_img4sub(url)
+    zfile = zipfile.ZipFile('%s.zip' % sub, mode)
+    download_images(imgs, zfile)
+    zfile.close()
+
+
+if __name__ == '__main__':
+    download_subreddit('Animewallpaper')
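
Usage sketch (not part of the patch; assumes riddle2.py from this commit is on
the import path and bs4/lxml are installed): the two-phase flow can also be
driven directly, e.g. fetching only the first three listing pages -- the
subreddit name and page count here are illustrative:

    import zipfile
    from riddle2 import get_img4sub, download_images

    # phase 1: collect image URLs from three listing pages
    imgs = get_img4sub('https://old.reddit.com/r/Animewallpaper/', length=3)
    # phase 2: download everything behind a single overall progress bar
    with zipfile.ZipFile('Animewallpaper.zip', 'w') as zf:
        download_images(imgs, zf)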