From 32af939716a55fbbf7fb095c024ab72f297217be Mon Sep 17 00:00:00 2001
From: Trivernis <19694973+Trivernis@users.noreply.github.com>
Date: Fri, 12 Oct 2018 08:29:49 +0200
Subject: [PATCH 01/15] Added comments to riddle.py

Added function descriptions and comments to global variables.
---
 riddle.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/riddle.py b/riddle.py
index 500afd2..4517597 100644
--- a/riddle.py
+++ b/riddle.py
@@ -8,11 +8,11 @@ import optparse
 import asyncio
 import shutil
 
-redditurl: str = 'https://old.reddit.com/r/%s'
-dl_dir: str = './.cache/'  # Format must be ./
-img_ext: List[str] = ['jpg', 'png', 'bmp']
-blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']
-hdr: Dict[str, str] = {
+redditurl: str = 'https://old.reddit.com/r/%s'  # the url for reddit with %s to insert the subreddit name
+dl_dir: str = './.cache/'  # Format must be ./ # the directory where files are cached. Will be created if it doesn't exist
+img_ext: List[str] = ['jpg', 'png', 'bmp']  # file extensions that are images
+blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']  # where images shouldn't be downloaded from
+hdr: Dict[str, str] = {  # request header
     'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko)
                   Chrome/23.0.1271.64 Safari/537.11""",
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +20,7 @@ hdr: Dict[str, str] = {
     'Connection': 'keep-alive'}
 
 
+# prints a progress bar
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filled_length = int(length * iteration // total)
@@ -30,6 +31,7 @@ def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=10
     print()
 
 
+# returns a soup for the given url
 async def request_soup(url):
     req = urlreq.Request(url, headers=hdr)
     html = None
@@ -44,6 +46,7 @@
     return soup
 
 
+# returns all images for the given url
 async def get_img_as(url):
     soup = await request_soup(url)
     ret = []
@@ -56,6 +59,7 @@
     return ret
 
 
+# returns the last post id in the given reddit page
 async def get_next(url):
     ids = []
     soup = await request_soup(url)
@@ -69,6 +73,7 @@
     return [_id for _id in ids if _id][-1]
 
 
+# returns if the given tag has a source attribute that is an image
 def has_source(tag):
     if tag.has_attr('src'):
         try:
@@ -85,6 +90,7 @@
     return False
 
 
+# downloads all images for the given url and puts them in a zipfile
 async def download_async(url, zfile=None, test=False):
     images = await get_img_as(url)
     print('[+] Found %s images' % len(images))
@@ -127,6 +133,7 @@
     print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))
 
 
+# loops over reddit-pages until no more images are found
 async def dl_loop(section, zfile, loop, chaos=False, test=False):
     baseurl = redditurl % section
     url = baseurl
@@ -151,6 +158,7 @@
         await asyncio.sleep(0.1)
 
 
+# the main function
 def main(sections, opts):
     chaos = opts.chaos
     if not os.path.exists(dl_dir):

From a513b5348d97e4dad97f5fe2f99d200b70cd30bf Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Sun, 18 Nov 2018 22:20:44 +0100
Subject: [PATCH 02/15] Started working on riddle2

New Features:
- First fetches all image urls (or a specific count) and starts the download
afterwards -> The user can see a progress bar stating the total download
progress instead of the progress of a single page
---
 riddle.py  |   2 +-
 riddle2.py | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 riddle2.py

diff --git a/riddle.py b/riddle.py
index 4517597..32cb15b 100644
--- a/riddle.py
+++ b/riddle.py
@@ -101,7 +101,7 @@ async def download_async(url, zfile=None, test=False):
     print_progress(count, imgcount, prefix="Downloading: ", suffix="Complete")
     for img in images:
         print_progress(count+1, imgcount, prefix="Downloading: ", suffix="Complete")
-        count+=1
+        count += 1
         if test:
             continue
         try:
diff --git a/riddle2.py b/riddle2.py
new file mode 100644
index 0000000..42ee7ab
--- /dev/null
+++ b/riddle2.py
@@ -0,0 +1,175 @@
+import urllib.request as urlreq
+
+from bs4 import BeautifulSoup
+import zipfile
+import time
+import os
+import sys
+
+blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
+dl_dir = './.cache/'
+img_ext = ['jpg', 'jpeg', 'png']  # define the urls we are searching for
+hdr = {  # request header
+    'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko)
+                  Chrome/23.0.1271.64 Safari/537.11""",
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
+    'Connection': 'keep-alive'}
+
+
+def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
+    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
+    filled_length = int(length * iteration // total)
+    bar = fill * filled_length + '-' * (length - filled_length)
+    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+    # Print New Line on Complete
+    if iteration == total:
+        print()
+
+
+def spinning_cursor():
+    while True:
+        for cursor in '|/-\\':
+            yield cursor
+
+
+def get_extension(fstring):
+    return fstring.split('.')[-1].lower()
+
+
+def get_soup4url(url):
+    """ Returns a soup for the url with 10 retries """
+    req = urlreq.Request(url, headers=hdr)
+    html = None
+    for x in range(0, 10):
+        try:
+            html = urlreq.urlopen(req).read()
+            break
+        except Exception as e:
+            print('[-]', e)
+    if html:
+        soup = BeautifulSoup(html, "lxml")
+        return soup
+    return False
+
+
+def has_source(tag):
+    if tag.has_attr('src'):
+        try:
+            return get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    elif tag.has_attr('data-url'):
+        try:
+            tag['src'] = tag['data-url']
+            return get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    else:
+        return False
+
+
+def get_next_url(baseurl, url):
+    ids = []
+    soup = get_soup4url(url)
+    if not soup:
+        return False
+    for t in soup.find_all(has_source):
+        if 'redditmedia' not in t['src']:
+            try:
+                fname = t['data-fullname']
+                ids.append(fname)
+            except KeyError:
+                pass
+    ids = [_id for _id in ids if _id]
+    if len(ids) == 0:
+        return False
+    _id = ids[-1]
+    next_url = '{}/?after={}'.format(baseurl, _id)
+    return next_url
+
+
+def get_img4site(url):
+    soup = get_soup4url(url)
+    if not soup:
+        return False
+    ret = []
+    sys.stdout.write('.')
+    sys.stdout.flush()
+    for t in soup.find_all(has_source):
+        try:
+            if 'redditmedia' not in t['src']:
+                img = t['src']
+                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
+                    img = url + img
+                if 'http' not in img.split('/')[0]:
+                    img = 'http:' + img
+                if img.strip('http://').strip('https://').split('/')[0] in blacklist:
+                    img = None
+                if img:
+                    ret.append(img)
+        except KeyError:
+            pass
+    return ret
+
+
+def get_img4sub(url, length=-1):
+    baseurl = url
+    imgs = []
+    print('[ ] 1/2 Getting images...')
+    if length >= 0:
+        for x in range(length):
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgs.extend(get_img4site(url))
+            url = get_next_url(baseurl, url)
+            if not url:
+                break
+        sys.stdout.write('\b')
+    else:
+        while url:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgs.extend(get_img4site(url))
+            url = get_next_url(baseurl, url)
+    return imgs
+
+
+def download_images(imgs, zfile):
+    count = 0
+    imgcount = len(imgs)
+    print('[ ] Downloading %s images' % imgcount)
+    if not os.path.isdir(dl_dir):
+        os.mkdir(dl_dir)
+    print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+    for img in imgs:
+        print_progress(count+1, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+        imgname = img.split('/')[-1]
+        name = dl_dir + imgname
+        if os.path.isfile(name):
+            continue
+        f = open(name, "wb")
+        req = urlreq.Request(img, headers=hdr)
+        image = urlreq.urlopen(req)
+        f.write(image.read())
+        f.close()
+        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
+        try:
+            os.remove(name)
+        except (FileNotFoundError, PermissionError):
+            pass
+        time.sleep(0.1)  # no don't penetrate
+        count += 1
+
+
+def download_subreddit(sub):
+    mode = 'w'
+    if os.path.isfile(sub + '.zip'):
+        mode = 'a'
+    url = 'https://old.reddit.com/r/%s/' % sub
+    imgs = get_img4sub(url)
+    zfile = zipfile.ZipFile('%s.zip' % sub, mode)
+    download_images(imgs, zfile)
+    zfile.close()
+
+
+if __name__ == '__main__':
+    download_subreddit('Animewallpaper')

From feaca33300a127b461b10b198be8d4dec15fd796 Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 19 Nov 2018 10:33:52 +0100
Subject: [PATCH 03/15] Added full functionality to riddle2.py

- -c option for the number of images to download
---
 riddle2.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 72 insertions(+), 15 deletions(-)

diff --git a/riddle2.py b/riddle2.py
index 42ee7ab..45f7802 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -1,10 +1,13 @@
+# encoding=utf-8
 import urllib.request as urlreq
-
 from bs4 import BeautifulSoup
+
 import zipfile
 import time
 import os
 import sys
+import optparse
+import shutil
 
 blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
 dl_dir = './.cache/'
@@ -15,6 +18,7 @@ hdr = {  # request header
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
     'Connection': 'keep-alive'}
+errors = {}
 
 
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
@@ -46,8 +50,11 @@
             html = urlreq.urlopen(req).read()
             break
         except Exception as e:
-            print('[-]', e)
-    if html:
+            if e in errors:
+                errors[e] += 1
+            else:
+                errors[e] = 1
+    if html:
         soup = BeautifulSoup(html, "lxml")
         return soup
     return False
@@ -98,7 +105,7 @@ def get_img4site(url):
     sys.stdout.flush()
     for t in soup.find_all(has_source):
         try:
-            if 'redditmedia' not in t['src']:
+            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
                 img = t['src']
                 if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                     img = url + img
@@ -118,37 +125,52 @@ def get_img4sub(url, length=-1):
     imgs = []
     print('[ ] 1/2 Getting images...')
     if length >= 0:
-        for x in range(length):
+        x = 0
+        while x < length:
             time.sleep(0.1)  # we don't want to flood with requests
-            imgs.extend(get_img4site(url))
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            x = len(imgs)
             url = get_next_url(baseurl, url)
             if not url:
                 break
         sys.stdout.write('\b')
+        imgs = imgs[:length]
     else:
         while url:
             time.sleep(0.1)  # we don't want to flood with requests
-            imgs.extend(get_img4site(url))
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
             url = get_next_url(baseurl, url)
+    print('[+] Found %s images' % len(imgs))
     return imgs
 
 
 def download_images(imgs, zfile):
-    count = 0
+    count = 1
     imgcount = len(imgs)
+    fnames = [zinfo.filename for zinfo in zfile.infolist()]
     print('[ ] Downloading %s images' % imgcount)
     if not os.path.isdir(dl_dir):
         os.mkdir(dl_dir)
-    print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
     for img in imgs:
-        print_progress(count+1, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+        print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
         imgname = img.split('/')[-1]
         name = dl_dir + imgname
-        if os.path.isfile(name):
+        if os.path.isfile(name) or imgname in fnames:
+            count += 1
             continue
         f = open(name, "wb")
         req = urlreq.Request(img, headers=hdr)
-        image = urlreq.urlopen(req)
+        try:
+            image = urlreq.urlopen(req)
+        except ConnectionError:
+            print('\n [-] Connection Error')
+            return
         f.write(image.read())
         f.close()
         zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
@@ -158,18 +180,53 @@ def download_images(imgs, zfile):
             pass
         time.sleep(0.1)  # no don't penetrate
         count += 1
+    added = len(zfile.infolist()) - len(fnames)
+    print('[+] Added %s files to the zipfile' % added)
 
 
-def download_subreddit(sub):
+def download_subreddit(sub, count=-1):
     mode = 'w'
     if os.path.isfile(sub + '.zip'):
         mode = 'a'
     url = 'https://old.reddit.com/r/%s/' % sub
-    imgs = get_img4sub(url)
+    imgs = get_img4sub(url, length=count)
     zfile = zipfile.ZipFile('%s.zip' % sub, mode)
     download_images(imgs, zfile)
     zfile.close()
 
 
+def cleanup():
+    print('[ ] Cleanup...')
+    if os.path.isdir(dl_dir):
+        shutil.rmtree(dl_dir)
+
+
+def parser_init():
+    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
+    parser.add_option('-c', '--count', dest='count',
+                      type='int', default=-1,
+                      help='The number of images to download.')
+    parser.add_option('-t', '--test', dest='test',
+                      action='store_true', default=False,
+                      help='Tests the functions of the script')
+    return parser.parse_args()
+
+
+def main():
+    options, subreddits = parser_init()
+    if options.count:
+        count = options.count
+    else:
+        count = -1
+    if options.test:
+        count = 1
+        subreddits = ['python']
+    for sub in subreddits:
+        print('[ ] Downloading %s' % sub)
+        download_subreddit(sub, count=count)
+    cleanup()
+    print(errors)
+
+
 if __name__ == '__main__':
-    download_subreddit('Animewallpaper')
+    main()

From c78d4f075a3295cecffee3669a18760ec7b24d9e Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 19 Nov 2018 10:51:02 +0100
Subject: [PATCH 04/15] Changes in riddle2

- Non-result prints are now [~]
- Waiting 1 second before connection retry
---
 riddle2.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/riddle2.py b/riddle2.py
index 45f7802..cfd2b3a 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -54,6 +54,7 @@
                 errors[e] += 1
             else:
                 errors[e] = 1
+            time.sleep(1)  # to avoid request flooding
     if html:
         soup = BeautifulSoup(html, "lxml")
         return soup
@@ -123,7 +124,7 @@ def get_img4site(url):
 def get_img4sub(url, length=-1):
     baseurl = url
     imgs = []
-    print('[ ] 1/2 Getting images...')
+    print('[~] 1/2 Getting images...')
     if length >= 0:
         x = 0
         while x < length:
@@ -154,7 +155,7 @@ def download_images(imgs, zfile):
     count = 1
     imgcount = len(imgs)
     fnames = [zinfo.filename for zinfo in zfile.infolist()]
-    print('[ ] Downloading %s images' % imgcount)
+    print('[~] Downloading %s images' % imgcount)
     if not os.path.isdir(dl_dir):
         os.mkdir(dl_dir)
     for img in imgs:
@@ -196,7 +197,7 @@ def download_subreddit(sub, count=-1):
 
 
 def cleanup():
-    print('[ ] Cleanup...')
+    print('[~] Cleanup...')
     if os.path.isdir(dl_dir):
         shutil.rmtree(dl_dir)
 
@@ -222,7 +223,7 @@ def main():
         count = 1
         subreddits = ['python']
     for sub in subreddits:
-        print('[ ] Downloading %s' % sub)
+        print('[~] Downloading %s' % sub)
         download_subreddit(sub, count=count)
     cleanup()
     print(errors)

From 633f84202a883508034cae8c3528b4dcfe6dddbc Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 19 Nov 2018 10:56:17 +0100
Subject: [PATCH 05/15] Added test config for riddle2

---
 .circleci/config.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 40a6e43..c56a9c4 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -45,11 +45,17 @@ jobs:
       # https://pytest.org
       # https://nose.readthedocs.io
       - run:
-          name: run tests
+          name: run tests for riddle.py
          command: |
             . venv/bin/activate
             python riddle.py -t Python
 
+      - run:
+          name: run tests for riddle2.py
+          command: |
+            . venv/bin/activate
+            python riddle2.py
+
       - store_artifacts:
           path: test-reports
           destination: test-reports

From bfec3157a5799742184f65ee9b47e2a9de0c6d7c Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Mon, 19 Nov 2018 13:03:06 +0100
Subject: [PATCH 06/15] Changes to riddle2.py

- progress bar is only half the width
- modified test option
- added option to save everything in one zipfile
---
 riddle2.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/riddle2.py b/riddle2.py
index cfd2b3a..0f02ebc 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -21,11 +21,12 @@ hdr = {  # request header
 errors = {}
 
 
-def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
+def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'):
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filled_length = int(length * iteration // total)
     bar = fill * filled_length + '-' * (length - filled_length)
     print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+    sys.stdout.flush()
     # Print New Line on Complete
     if iteration == total:
         print()
@@ -185,13 +186,16 @@ def download_images(imgs, zfile):
     print('[+] Added %s files to the zipfile' % added)
 
 
-def download_subreddit(sub, count=-1):
+def download_subreddit(sub, count=-1, out=None):
     mode = 'w'
-    if os.path.isfile(sub + '.zip'):
+    zname = sub + '.zip'
+    if out:
+        zname = out
+    if os.path.isfile(zname):
         mode = 'a'
     url = 'https://old.reddit.com/r/%s/' % sub
     imgs = get_img4sub(url, length=count)
-    zfile = zipfile.ZipFile('%s.zip' % sub, mode)
+    zfile = zipfile.ZipFile(zname, mode)
     download_images(imgs, zfile)
     zfile.close()
 
@@ -207,6 +211,9 @@ def parser_init():
     parser.add_option('-c', '--count', dest='count',
                       type='int', default=-1,
                       help='The number of images to download.')
+    parser.add_option('-o', '--output', dest='output',
+                      type='str', default=None,
+                      help='The name of the output zipfile. If none is specified, it\'s the subreddits name.')
     parser.add_option('-t', '--test', dest='test',
                       action='store_true', default=False,
                       help='Tests the functions of the script')
@@ -215,18 +222,20 @@ def parser_init():
 
 def main():
     options, subreddits = parser_init()
-    if options.count:
-        count = options.count
-    else:
-        count = -1
+    count = options.count
+    output = options.output
     if options.test:
         count = 1
         subreddits = ['python']
+        output = 'test.zip'
     for sub in subreddits:
         print('[~] Downloading %s' % sub)
-        download_subreddit(sub, count=count)
+        download_subreddit(sub, count=count, out=output)
     cleanup()
-    print(errors)
+    if options.test:
+        os.remove(output)
+    if len(errors.keys()) > 0:
+        print(errors)
 
 
 if __name__ == '__main__':

From c3ea1be7f6ca97a2b8331d6a7ffc95e91075d102 Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Mon, 19 Nov 2018 13:03:58 +0100
Subject: [PATCH 07/15] Modified CI config

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c56a9c4..7c4d42b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -54,7 +54,7 @@ jobs:
           name: run tests for riddle2.py
           command: |
             . venv/bin/activate
-            python riddle2.py
+            python riddle2.py -t
 
       - store_artifacts:
           path: test-reports

From 11d8635b4a71a3e147ca836bc5568a577f454fd3 Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Mon, 19 Nov 2018 13:05:44 +0100
Subject: [PATCH 08/15] Changes in riddle2.py

- Added blank line after image searching
---
 riddle2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/riddle2.py b/riddle2.py
index 0f02ebc..f583c78 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -183,6 +183,7 @@ def download_images(imgs, zfile):
         time.sleep(0.1)  # no don't penetrate
         count += 1
     added = len(zfile.infolist()) - len(fnames)
+    print()
     print('[+] Added %s files to the zipfile' % added)
 
 

From a209475deaccae67d1b347b9a7ebe55ef8d003fd Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Mon, 19 Nov 2018 13:06:59 +0100
Subject: [PATCH 09/15] Changes in riddle2.py

- fixed error
---
 riddle2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riddle2.py b/riddle2.py
index f583c78..1b908f8 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -148,6 +148,7 @@ def get_img4sub(url, length=-1):
                 break
             imgs.extend(imgurls)
             url = get_next_url(baseurl, url)
+    print()
     print('[+] Found %s images' % len(imgs))
     return imgs
 
@@ -183,7 +184,6 @@ def download_images(imgs, zfile):
         time.sleep(0.1)  # no don't penetrate
         count += 1
     added = len(zfile.infolist()) - len(fnames)
-    print()
     print('[+] Added %s files to the zipfile' % added)
 
 

From f79a0de8c1fd2a82b39ece29d1d94ed1e27eee59 Mon Sep 17 00:00:00 2001
From: Julius
Date: Tue, 20 Nov 2018 19:08:36 +0100
Subject: [PATCH 10/15] Modified project in general

- Exported some functionality into libraries
---
 .gitignore           |  4 +-
 conf/logging.config  | 27 +++++++++++++
 lib/cutils.py        | 31 +++++++++++++++
 lib/fsutils.py       | 70 +++++++++++++++++++++++++++++++++
 lib/logs/utility.log |  0
 lib/logutils.py      | 13 +++++++
 lib/netutils.py      | 37 ++++++++++++++++++
 logs/utility.log     | 61 +++++++++++++++++++++++++++++
 requirements.txt     |  1 -
 riddle2.py           | 92 +++++++++----------------------------------
 10 files changed, 261 insertions(+), 75 deletions(-)
 create mode 100644 conf/logging.config
 create mode 100644 lib/cutils.py
 create mode 100644 lib/fsutils.py
 create mode 100644 lib/logs/utility.log
 create mode 100644 lib/logutils.py
 create mode 100644 lib/netutils.py
 create mode 100644 logs/utility.log

diff --git a/.gitignore b/.gitignore
index 264ab3d..4c988b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 *.zip
 .idea
-.cache
\ No newline at end of file
+.cache
+__pycache__.*
+__pycache__
\ No newline at end of file
diff --git a/conf/logging.config b/conf/logging.config
new file mode 100644
index 0000000..57f0eb5
--- /dev/null
+++ b/conf/logging.config
@@ -0,0 +1,27 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=stream_handler, file_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level=DEBUG
+handlers=stream_handler, file_handler
+
+[handler_stream_handler]
+class=StreamHandler
+level=FATAL
+formatter=formatter
+args=(sys.stderr,)
+
+[handler_file_handler]
+class=handlers.TimedRotatingFileHandler
+level=DEBUG
+formatter=formatter
+args=('./logs/utility.log','midnight',1,5,'utf-8',False,True,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
\ No newline at end of file
diff --git a/lib/cutils.py b/lib/cutils.py
new file mode 100644
index 0000000..5235482
--- /dev/null
+++ b/lib/cutils.py
@@ -0,0 +1,31 @@
+
+class ProgressBar:
+    def __init__(self, total=100, prefix='', suffix='', length=50, fill='█'):
+        self.prefix = prefix
+        self.suffix = suffix
+        self.fill = fill
+        self.length = length
+        self.total = total
+        self.progress = 0
+
+    def tick(self):
+        self.progress += 1
+        self._print_progress()
+
+    def setprogress(self, progress):
+        self.progress = progress
+        self._print_progress()
+
+    def _print_progress(self):
+        iteration = self.progress
+        total = self.total
+        prefix = self.prefix
+        suffix = self.suffix
+
+        percent = ("{0:." + str(1) + "f}").format(100 * (iteration / float(total)))
+        filled_length = int(self.length * iteration // total)
+        bar = self.fill * filled_length + '-' * (self.length - filled_length)
+        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+        # Print New Line on Complete
+        if iteration == total:
+            print()
diff --git a/lib/fsutils.py b/lib/fsutils.py
new file mode 100644
index 0000000..4bc84b2
--- /dev/null
+++ b/lib/fsutils.py
@@ -0,0 +1,70 @@
+import os
+import shutil
+
+
+def dir_exist_guarantee(dirpath):
+    if not os.path.exists(dirpath):
+        os.mkdir(dirpath)
+
+
+def get_extension(fname: str):
+    return fname.split('.')[-1]
+
+
+class FileInfo:
+    """ A simple wrapper around the os path functions that returns basic file info
+    and lets you perform basic file tasks."""
+    def __init__(self, fname: str):
+        self._init_info(fname)
+
+    def _init_info(self, fname):
+        """ Sets all the required variables for performing file tasks and to
+        access when working with the file object. """
+        # stringvars
+        self._path = os.path.normpath(fname.replace('\\', '/')).encode('utf-8')
+        if not os.path.isfile(self._path):
+            raise Exception("Not a File")
+        self._extless, self.extension = os.path.splitext(self._path)
+        self.dirname, self.basename = os.path.split(self._path)
+        self.fullname = os.path.join(self.dirname, self.basename)
+        # boolvars
+        self.exist = os.path.exists(self.fullname)
+        self.ismount = self.islink = False
+        if self.exist:
+            self.ismount = os.path.ismount(self.fullname)
+            self.islink = os.path.islink(self.fullname)
+
+    def delete(self):
+        """ Deletes the file if it exists.
+        Does nothing, if it does not exist."""
+        if self.exist:
+            os.remove(self.fullname)
+
+    def create(self):
+        """ Creates the file if it doesn't exist.
+        Does nothing, if it does."""
+        if not self.exist:
+            with open(self.fullname, 'w') as f:
+                f.write('')
+
+    def reset(self):
+        """ Opens the file and writes nothing into it. """
+        with open(self.fullname, 'w') as f:
+            f.write('')
+
+    def open(self, mode: str):
+        """ Returns the file opened with the open method. """
+        self.create()
+        return open(self.fullname, mode)
+
+    def copy(self, dest: str):
+        if self.exist:
+            shutil.copyfile(self.fullname, dest)
+            return FileInfo(dest)
+
+    def move(self, dest: str):
+        if self.exist:
+            shutil.move(self.fullname, dest)
+            self._init_info(dest)
+        else:
+            self._init_info(dest)
diff --git a/lib/logs/utility.log b/lib/logs/utility.log
new file mode 100644
index 0000000..e69de29
diff --git a/lib/logutils.py b/lib/logutils.py
new file mode 100644
index 0000000..f45a034
--- /dev/null
+++ b/lib/logutils.py
@@ -0,0 +1,13 @@
+import logging
+from logging.config import fileConfig
+
+from lib import fsutils
+
+
+def get_logger(name=None):
+    fsutils.dir_exist_guarantee('logs')
+    fileConfig('./conf/logging.config')
+    if name:
+        return logging.getLogger(name)
+    else:
+        return logging.getLogger()
diff --git a/lib/netutils.py b/lib/netutils.py
new file mode 100644
index 0000000..c3d131a
--- /dev/null
+++ b/lib/netutils.py
@@ -0,0 +1,37 @@
+import urllib.request as urlreq
+import time
+
+from bs4 import BeautifulSoup
+
+from lib import logutils
+
+logger = logutils.get_logger('netutils')
+
+
+def get_soup4url(url: str, retrys: int =2, headers: dict=urlreq.noheaders(), timeout: int =30) -> BeautifulSoup:
+    """ Returns a soup for the url """
+    req = urlreq.Request(url, headers=headers)
+    html = None
+    for _ in range(0, retrys+1):
+        try:
+            html = urlreq.urlopen(req, timeout=timeout).read()
+            break
+        except Exception as e:
+            logger.exception(e)
+            time.sleep(1)  # to avoid request flooding
+    if html:
+        soup = BeautifulSoup(html, "lxml")
+        return soup
+    return False
+
+
+def download_file(url: str, dest: str, headers: dict=urlreq.noheaders()):
+    f = open(dest, "wb")
+    req = urlreq.Request(url, headers=headers)
+    try:
+        image = urlreq.urlopen(req)
+    except ConnectionError:
+        print('\n [-] Connection Error')
+        return
+    f.write(image.read())
+    f.close()
diff --git a/logs/utility.log b/logs/utility.log
new file mode 100644
index 0000000..d8bd6e2
--- /dev/null
+++ b/logs/utility.log
@@ -0,0 +1,61 @@
+2018-11-20 11:15:43,247 netutils     ERROR    <urlopen error _ssl.c:830: The handshake operation timed out>
+Traceback (most recent call last):
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1318, in do_open
+    encode_chunked=req.has_header('Transfer-encoding'))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1239, in request
+    self._send_request(method, url, body, headers, encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1285, in _send_request
+    self.endheaders(body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1234, in endheaders
+    self._send_output(message_body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1026, in _send_output
+    self.send(msg)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 964, in send
+    self.connect()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1400, in connect
+    server_hostname=server_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 407, in wrap_socket
+    _context=self, _session=session)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 817, in __init__
+    self.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1077, in do_handshake
+    self._sslobj.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 689, in do_handshake
+    self._sslobj.do_handshake()
+socket.timeout: _ssl.c:830: The handshake operation timed out
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 223, in urlopen
+    return opener.open(url, data, timeout)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 526, in open
+    response = self._open(req, data)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 544, in _open
+    '_open', req)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 504, in _call_chain
+    result = func(*args)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1361, in https_open
+    context=self._context, check_hostname=self._check_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1320, in do_open
+    raise URLError(err)
+urllib.error.URLError: <urlopen error _ssl.c:830: The handshake operation timed out>
+2018-11-20 14:11:39,064 netutils     ERROR    The read operation timed out
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 462, in read
+    s = self._safe_read(self.length)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 612, in _safe_read
+    chunk = self.fp.read(min(amt, MAXAMOUNT))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\socket.py", line 586, in readinto
+    return self._sock.recv_into(b)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1012, in recv_into
+    return self.read(nbytes, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 874, in read
+    return self._sslobj.read(len, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 631, in read
+    v = self._sslobj.read(len, buffer)
+socket.timeout: The read operation timed out
diff --git a/requirements.txt b/requirements.txt
index 7c944ce..6e88d8a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 beautifulsoup4==4.6.3
-bs4==0.0.1
 lxml==4.2.5
 typing==3.6.4
\ No newline at end of file
diff --git a/riddle2.py b/riddle2.py
index 1b908f8..aea2cb5 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -1,7 +1,3 @@
-# encoding=utf-8
-import urllib.request as urlreq
-from bs4 import BeautifulSoup
-
 import zipfile
 import time
 import os
@@ -9,6 +5,8 @@ import sys
 import optparse
 import shutil
 
+from lib import cutils, netutils, fsutils
+
 blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
 dl_dir = './.cache/'
 img_ext = ['jpg', 'jpeg', 'png']  # define the urls we are searching for
@@ -21,66 +19,25 @@ hdr = {  # request header
 errors = {}
 
 
-def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'):
-    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
-    filled_length = int(length * iteration // total)
-    bar = fill * filled_length + '-' * (length - filled_length)
-    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
-    sys.stdout.flush()
-    # Print New Line on Complete
-    if iteration == total:
-        print()
-
-
-def spinning_cursor():
-    while True:
-        for cursor in '|/-\\':
-            yield cursor
-
-
-def get_extension(fstring):
-    return fstring.split('.')[-1].lower()
-
-
-def get_soup4url(url):
-    """ Returns a soup for the url with 10 retries """
-    req = urlreq.Request(url, headers=hdr)
-    html = None
-    for x in range(0, 10):
-        try:
-            html = urlreq.urlopen(req).read()
-            break
-        except Exception as e:
-            if e in errors:
-                errors[e] += 1
-            else:
-                errors[e] = 1
-            time.sleep(1)  # to avoid request flooding
-    if html:
-        soup = BeautifulSoup(html, "lxml")
-        return soup
-    return False
-
-
-def has_source(tag):
+def has_source(tag: netutils.BeautifulSoup) -> bool:
     if tag.has_attr('src'):
         try:
-            return get_extension(tag['src']) in img_ext
+            return fsutils.get_extension(tag['src']) in img_ext
         except (IndexError, KeyError):
             return False
     elif tag.has_attr('data-url'):
         try:
             tag['src'] = tag['data-url']
-            return get_extension(tag['src']) in img_ext
+            return fsutils.get_extension(tag['src']) in img_ext
         except (IndexError, KeyError):
             return False
     else:
         return False
 
 
-def get_next_url(baseurl, url):
+def get_next_url(baseurl: str, url: str):
     ids = []
-    soup = get_soup4url(url)
+    soup = netutils.get_soup4url(url, headers=hdr)
     if not soup:
         return False
     for t in soup.find_all(has_source):
@@ -92,16 +49,16 @@ def get_next_url(baseurl, url):
             pass
     ids = [_id for _id in ids if _id]
     if len(ids) == 0:
-        return False
+        return []
     _id = ids[-1]
     next_url = '{}/?after={}'.format(baseurl, _id)
     return next_url
 
 
-def get_img4site(url):
-    soup = get_soup4url(url)
+def get_img4site(url: str) -> list:
+    soup = netutils.get_soup4url(url, headers=hdr)
     if not soup:
-        return False
+        return []
     ret = []
     sys.stdout.write('.')
     sys.stdout.flush()
@@ -122,7 +79,7 @@ def get_img4site(url):
     return ret
 
 
-def get_img4sub(url, length=-1):
+def get_img4sub(url: str, length: int =-1) -> list:
     baseurl = url
     imgs = []
     print('[~] 1/2 Getting images...')
@@ -153,41 +110,30 @@ def get_img4sub(url, length=-1):
     return imgs
 
 
-def download_images(imgs, zfile):
-    count = 1
+def download_images(imgs: list, zfile: zipfile.ZipFile):
     imgcount = len(imgs)
     fnames = [zinfo.filename for zinfo in zfile.infolist()]
     print('[~] Downloading %s images' % imgcount)
-    if not os.path.isdir(dl_dir):
-        os.mkdir(dl_dir)
+    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloading", suffix="Complete")
+    fsutils.dir_exist_guarantee(dl_dir)
     for img in imgs:
-        print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+        pb.tick()
         imgname = img.split('/')[-1]
-        name = dl_dir + imgname
+        name = os.path.join(dl_dir, imgname)
         if os.path.isfile(name) or imgname in fnames:
-            count += 1
             continue
-        f = open(name, "wb")
-        req = urlreq.Request(img, headers=hdr)
-        try:
-            image = urlreq.urlopen(req)
-        except ConnectionError:
-            print('\n [-] Connection Error')
-            return
-        f.write(image.read())
-        f.close()
+        netutils.download_file(img, name, headers=hdr)
         zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
         try:
             os.remove(name)
         except (FileNotFoundError, PermissionError):
             pass
         time.sleep(0.1)  # no don't penetrate
-        count += 1
     added = len(zfile.infolist()) - len(fnames)
     print('[+] Added %s files to the zipfile' % added)
 
 
-def download_subreddit(sub, count=-1, out=None):
+def download_subreddit(sub: str, count: int =-1, out: str =None):
     mode = 'w'
     zname = sub + '.zip'
     if out:

From 632164dba1c329cc42c462133fbc2cea2af0a7f6 Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Tue, 20 Nov 2018 19:14:03 +0100
Subject: [PATCH 11/15] Auto stash before merge of "develop" and "python-utility-scripts/develop"

---
 riddle2.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/riddle2.py b/riddle2.py
index aea2cb5..838b9b5 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -48,8 +48,8 @@ def get_next_url(baseurl: str, url: str):
         except KeyError:
             pass
     ids = [_id for _id in ids if _id]
-    if len(ids) == 0:
-        return []
+    if len(ids) == 0:  # if no id was found, we can't get any further into the past
+        return False
     _id = ids[-1]
     next_url = '{}/?after={}'.format(baseurl, _id)
     return next_url
@@ -164,9 +164,20 @@ def parser_init():
     parser.add_option('-t', '--test', dest='test',
                       action='store_true', default=False,
                       help='Tests the functions of the script')
+    parser.add_option('-l', '--loop', dest='loop',
+                      action='store_true', default=False,
+                      help="""Continuing download loop. When this option is set every 5 minutes the program searches for
+                      new images""")
     return parser.parse_args()
 
 
+def download_subreddits(subreddits, count, output):
+    for sub in subreddits:
+        print('[~] Downloading %s' % sub)
+        download_subreddit(sub, count=count, out=output)
+        print()
+
+
 def main():
     options, subreddits = parser_init()
     count = options.count
@@ -175,14 +186,19 @@ def main():
         count = 1
         subreddits = ['python']
         output = 'test.zip'
-    for sub in subreddits:
-        print('[~] Downloading %s' % sub)
-        download_subreddit(sub, count=count, out=output)
+    if options.loop:
+        while True:
+            download_subreddits(subreddits, count, output)
+            time.sleep(300)
+    else:
+        download_subreddits(subreddits, count, output)
     cleanup()
     if options.test:
         os.remove(output)
     if len(errors.keys()) > 0:
-        print(errors)
+        print('[-] Following errors occurred:')
+        for key in errors.keys():
+            print(' %s times: %s' % (errors[key], key))
 
 
 if __name__ == '__main__':

From 0c64cada00680d479a1ecb60d48c9dc4b7179718 Mon Sep 17 00:00:00 2001
From: Trivernis
Date: Tue, 20 Nov 2018 20:26:40 +0100
Subject: [PATCH 12/15] Lib and riddle2 changes

- Added message after loop in riddle2
- Ignoring nonexistent logfile config instead of throwing an error
---
 .gitignore      | 2 ++
 lib/logutils.py | 5 +++--
 riddle2.py      | 3 ++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4c988b5..740d98d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.zip
+*.log
+logs
 .idea
 .cache
 __pycache__.*
diff --git a/lib/logutils.py b/lib/logutils.py
index f45a034..8a5950c 100644
--- a/lib/logutils.py
+++ b/lib/logutils.py
@@ -5,8 +5,9 @@ from lib import fsutils
 
 
 def get_logger(name=None):
-    fsutils.dir_exist_guarantee('logs')
-    fileConfig('./conf/logging.config')
+    if fsutils.os.path.isfile('./conf/logging.config'):
+        fsutils.dir_exist_guarantee('logs')
+        fileConfig('./conf/logging.config')
     if name:
         return logging.getLogger(name)
     else:
diff --git a/riddle2.py b/riddle2.py
index 838b9b5..63b4990 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -49,7 +49,7 @@ def get_next_url(baseurl: str, url: str):
             pass
     ids = [_id for _id in ids if _id]
     if len(ids) == 0:  # if no id was found, we can't get any further into the past
-        return False
+        return None
     _id = ids[-1]
     next_url = '{}/?after={}'.format(baseurl, _id)
     return next_url
@@ -189,6 +189,7 @@ def main():
     if options.loop:
         while True:
             download_subreddits(subreddits, count, output)
+            print('[~] Next Download in 5 minutes...')
             time.sleep(300)
     else:
         download_subreddits(subreddits, count, output)

From 27f86db8aa3896255858e90af9459213c522d64c Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 3 Dec 2018 09:15:48 +0100
Subject: [PATCH 13/15] Modified Readme

- Added Description for riddle2.py
---
 README.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 33945b6..59934b8 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 ## Scripts
 
 ### riddle.py
-Downloads all images from one ore more given subreddits
+Downloads all images from one or more given subreddits
 ```commandline
 Usage: riddle.py [options] [subreddits]
 
@@ -23,4 +23,23 @@ Options:
                         images can be found. Do only activate this if you want
                         to download a lot of images from multiple subreddits
                         at the same time.
+```
+
+### riddle2.py
+Downloads all images from one or more given subreddits in a more predictable
+ way than riddle.py.
+```commandline
+Usage: riddle2.py [options] [subreddits]
+
+Options:
+  -h, --help            show this help message and exit
+  -c COUNT, --count=COUNT
+                        The number of images to download.
+  -o OUTPUT, --output=OUTPUT
+                        The name of the output zipfile. If none is specified,
+                        it's the subreddits name.
+  -t, --test            Tests the functions of the script
+  -l, --loop            Continuing download loop. When this option is set
+                        every 5 minutes the program searches for
+                        new images
 ```
\ No newline at end of file

From a472b158469db8db227aacc3cdcfc45eb60b2cd1 Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 3 Dec 2018 09:18:09 +0100
Subject: [PATCH 14/15] Added step description

- Downloading is now 2/2
---
 riddle2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/riddle2.py b/riddle2.py
index 63b4990..a6868f1 100644
--- a/riddle2.py
+++ b/riddle2.py
@@ -113,7 +113,7 @@ def get_img4sub(url: str, length: int =-1) -> list:
 def download_images(imgs: list, zfile: zipfile.ZipFile):
     imgcount = len(imgs)
     fnames = [zinfo.filename for zinfo in zfile.infolist()]
-    print('[~] Downloading %s images' % imgcount)
+    print('[~] 2/2 Downloading %s images' % imgcount)
     pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloading", suffix="Complete")
     fsutils.dir_exist_guarantee(dl_dir)
     for img in imgs:

From 685ce33d9a3ba9b4db1549198c3f6851152cb8b6 Mon Sep 17 00:00:00 2001
From: Julius
Date: Mon, 3 Dec 2018 09:24:57 +0100
Subject: [PATCH 15/15] Fixed a quality issue in riddle2.py

- Removed else after return
---
 lib/logutils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/logutils.py b/lib/logutils.py
index 8a5950c..c31ac2c 100644
--- a/lib/logutils.py
+++ b/lib/logutils.py
@@ -10,5 +10,4 @@ def get_logger(name=None):
         fileConfig('./conf/logging.config')
     if name:
         return logging.getLogger(name)
-    else:
-        return logging.getLogger()
+    return logging.getLogger()
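
Appendix: a condensed sketch of the riddle2 approach

The fifteen patches above evolve riddle2.py toward a two-phase design: first collect
every image URL by following old.reddit's '?after=<fullname>' pagination, then download
the collected URLs into a zipfile, skipping names that are already archived. The sketch
below condenses that flow into a single minimal script. The pagination parameter, the
'data-fullname' attribute, the blacklist hosts, and the crawl delays come from the
patches; the function names, the abbreviated User-Agent header, and the OSError handling
are illustrative assumptions, not the committed code.

    import time
    import zipfile
    import urllib.request as urlreq

    from bs4 import BeautifulSoup

    HDR = {'User-Agent': 'Mozilla/5.0'}  # abbreviated; the patches use a fuller header
    IMG_EXT = ('jpg', 'jpeg', 'png')
    BLACKLIST = ('b.thumbs.redditmedia.com', 'reddit.com')


    def get_soup(url, retries=2, timeout=30):
        # fetch and parse one listing page, retrying on transient errors
        # (same idea as netutils.get_soup4url in patch 10)
        for _ in range(retries + 1):
            try:
                html = urlreq.urlopen(urlreq.Request(url, headers=HDR), timeout=timeout).read()
                return BeautifulSoup(html, 'lxml')
            except Exception:
                time.sleep(1)  # back off instead of flooding the server with retries
        return None


    def page_images(soup):
        # yield (image url, post fullname) pairs found on one listing page;
        # like has_source/get_next_url, this assumes the tags carrying the image
        # source also carry the 'data-fullname' post id
        for tag in soup.find_all(lambda t: t.has_attr('src') or t.has_attr('data-url')):
            src = tag.get('src') or tag.get('data-url')
            if src.rsplit('.', 1)[-1].lower() not in IMG_EXT:
                continue
            if src.startswith('//'):  # normalize protocol-relative urls
                src = 'https:' + src
            if '://' not in src or src.split('/')[2] in BLACKLIST or 'redditmedia' in src:
                continue
            yield src, tag.get('data-fullname')


    def collect(sub, limit=50):
        # phase 1: walk listing pages via the '?after=<fullname>' parameter
        base = 'https://old.reddit.com/r/%s/' % sub
        url, imgs = base, []
        while url and len(imgs) < limit:
            soup = get_soup(url)
            if soup is None:
                break
            last_id = None
            for src, fullname in page_images(soup):
                imgs.append(src)
                last_id = fullname or last_id
            url = '%s?after=%s' % (base, last_id) if last_id else None
            time.sleep(0.1)  # polite crawl delay, as in the patches
        return imgs[:limit]


    def download_to_zip(imgs, zname):
        # phase 2: download each image into the zip, skipping archived names
        with zipfile.ZipFile(zname, 'a') as zf:
            have = set(zi.filename for zi in zf.infolist())
            for img in imgs:
                name = img.split('/')[-1]
                if name in have:
                    continue
                try:
                    data = urlreq.urlopen(urlreq.Request(img, headers=HDR)).read()
                except OSError:
                    continue  # skip unreachable images instead of aborting the batch
                zf.writestr(name, data)
                have.add(name)
                time.sleep(0.1)


    if __name__ == '__main__':
        download_to_zip(collect('Animewallpaper', limit=10), 'Animewallpaper.zip')

Because URL collection finishes before the first download starts, the total image count
is known up front, which is what makes the single overall progress estimate possible;
a ProgressBar like the one in lib/cutils.py can then be driven by one tick() per file.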