# encoding=utf-8
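"""Download images from one or more subreddits into a zip archive.

The script walks a subreddit's old.reddit.com listing pages, collects links to
jpg/jpeg/png files, downloads them into a temporary cache directory and adds
them to <subreddit>.zip (or the file given with -o/--output). See the example
invocations below the option parser.
"""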
import urllib.request as urlreq
from bs4 import BeautifulSoup
import zipfile
import time
import os
import sys
import optparse
import shutil

blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
dl_dir = './.cache/'
img_ext = ['jpg', 'jpeg', 'png']  # image file extensions we are searching for
hdr = {  # request header
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
                  'Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'}
errors = {}  # maps error messages to how often they occurred


def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=50, fill='█'):
    """Print a simple text progress bar, overwriting the current line."""
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filled_length = int(length * iteration // total)
    bar = fill * filled_length + '-' * (length - filled_length)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    sys.stdout.flush()
    # print a new line on completion
    if iteration == total:
        print()


def spinning_cursor():
    while True:
        for cursor in '|/-\\':
            yield cursor


def get_extension(fstring):
    return fstring.split('.')[-1].lower()


def get_soup4url(url):
    """Return a BeautifulSoup for the url, retrying up to 10 times."""
    req = urlreq.Request(url, headers=hdr)
    html = None
    for x in range(0, 10):
        try:
            html = urlreq.urlopen(req).read()
            break
        except Exception as e:
            # count how often each error message occurred
            key = str(e)
            if key in errors:
                errors[key] += 1
            else:
                errors[key] = 1
            time.sleep(1)  # to avoid request flooding
    if html:
        soup = BeautifulSoup(html, "lxml")
        return soup
    return False


def has_source(tag):
    """True if the tag points to an image file via 'src' or 'data-url'."""
    if tag.has_attr('src'):
        try:
            return get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    elif tag.has_attr('data-url'):
        try:
            tag['src'] = tag['data-url']
            return get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    else:
        return False


def get_next_url(baseurl, url):
    """Build the url of the next listing page from the last post id on this page."""
    ids = []
    soup = get_soup4url(url)
    if not soup:
        return False
    for t in soup.find_all(has_source):
        if 'redditmedia' not in t['src']:
            try:
                fname = t['data-fullname']
                ids.append(fname)
            except KeyError:
                pass
    ids = [_id for _id in ids if _id]
    if len(ids) == 0:
        return False
    _id = ids[-1]
    next_url = '{}/?after={}'.format(baseurl, _id)
    return next_url


def get_img4site(url):
    """Collect absolute image urls from a single listing page."""
    soup = get_soup4url(url)
    if not soup:
        return False
    ret = []
    sys.stdout.write('.')
    sys.stdout.flush()
    for t in soup.find_all(has_source):
        try:
            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
                img = t['src']
                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                    img = url + img
                if 'http' not in img.split('/')[0]:
                    img = 'http:' + img
                # extract the host part of the url and skip blacklisted hosts
                host = img.split('//', 1)[-1].split('/')[0]
                if host in blacklist:
                    img = None
                if img:
                    ret.append(img)
        except KeyError:
            pass
    return ret


def get_img4sub(url, length=-1):
    """Collect image urls from a subreddit; a negative length means 'all pages'."""
    baseurl = url
    imgs = []
    print('[~] 1/2 Getting images...')
    if length >= 0:
        x = 0
        while x < length:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            x = len(imgs)
            url = get_next_url(baseurl, url)
            if not url:
                break
        sys.stdout.write('\b')
        imgs = imgs[:length]
    else:
        while url:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            url = get_next_url(baseurl, url)
    print('[+] Found %s images' % len(imgs))
    return imgs


def download_images(imgs, zfile):
    """Download every url in imgs and add the files to the open zipfile."""
    count = 1
    imgcount = len(imgs)
    fnames = [zinfo.filename for zinfo in zfile.infolist()]
    print('[~] Downloading %s images' % imgcount)
    if not os.path.isdir(dl_dir):
        os.mkdir(dl_dir)
    for img in imgs:
        print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
        imgname = img.split('/')[-1]
        name = dl_dir + imgname
        if os.path.isfile(name) or imgname in fnames:
            count += 1
            continue
        req = urlreq.Request(img, headers=hdr)
        try:
            image = urlreq.urlopen(req)
        except ConnectionError:
            print('\n [-] Connection Error')
            return
        with open(name, "wb") as f:
            f.write(image.read())
        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
        try:
            os.remove(name)
        except (FileNotFoundError, PermissionError):
            pass
        time.sleep(0.1)  # don't hammer the server
        count += 1
    added = len(zfile.infolist()) - len(fnames)
    print('[+] Added %s files to the zipfile' % added)


def download_subreddit(sub, count=-1, out=None):
    """Download up to `count` images from the subreddit `sub` into a zipfile."""
    mode = 'w'
    zname = sub + '.zip'
    if out:
        zname = out
    if os.path.isfile(zname):
        mode = 'a'
    url = 'https://old.reddit.com/r/%s/' % sub
    imgs = get_img4sub(url, length=count)
    zfile = zipfile.ZipFile(zname, mode)
    download_images(imgs, zfile)
    zfile.close()


def cleanup():
    print('[~] Cleanup...')
    if os.path.isdir(dl_dir):
        shutil.rmtree(dl_dir)


def parser_init():
    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
    parser.add_option('-c', '--count', dest='count',
                      type='int', default=-1,
                      help='The number of images to download.')
    parser.add_option('-o', '--output', dest='output',
                      type='str', default=None,
                      help='The name of the output zipfile. If none is specified, it\'s the subreddit\'s name.')
    parser.add_option('-t', '--test', dest='test',
                      action='store_true', default=False,
                      help='Tests the functions of the script')
    return parser.parse_args()
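
# Example invocations of the CLI defined above (the script file name is an
# assumption; use whatever name this file is saved under):
#
#   python reddit_zip.py wallpapers                  # every image found in r/wallpapers
#   python reddit_zip.py -c 50 -o pics.zip python    # at most 50 images from r/python into pics.zip
#   python reddit_zip.py -t                          # self-test: 1 image from r/python into test.zip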


def main():
    options, subreddits = parser_init()
    count = options.count
    output = options.output
    if options.test:
        count = 1
        subreddits = ['python']
        output = 'test.zip'
    for sub in subreddits:
        print('[~] Downloading %s' % sub)
        download_subreddit(sub, count=count, out=output)
    cleanup()
    if options.test:
        os.remove(output)
    if len(errors.keys()) > 0:
        print(errors)


if __name__ == '__main__':
    main()