|
|
|
import urllib.request as urlreq
|
|
|
|
from typing import List, Dict
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import os
|
|
|
|
import zipfile
|
|
|
|
import optparse
|
|
|
|
import asyncio
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
# the url for old reddit with %s to insert the subreddit name
redditurl: str = 'https://old.reddit.com/r/%s'

# the directory where files are cached. Format must be ./ -- created if it doesn't exist
dl_dir: str = './.cache/'

# file extensions that are treated as images
img_ext: List[str] = ['jpg', 'png', 'bmp']

# hosts that images shouldn't be downloaded from
blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']

# request header sent with every HTTP request.
# BUG FIX: the User-Agent was a triple-quoted string containing a raw
# newline; http.client rejects header values with embedded newlines
# (ValueError: Invalid header value), so it must be a single line.
hdr: Dict[str, str] = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                  '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
|
|
|
|
|
|
|
|
|
|
|
|
# prints a progress bar
def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """Render a single-line console progress bar that overwrites itself.

    iteration -- current step (expected 0..total)
    total     -- total number of steps
    prefix    -- text before the bar
    suffix    -- text after the percentage
    decimals  -- decimal places shown on the percentage
    length    -- bar width in characters
    fill      -- character used for the completed portion
    """
    fmt = '{0:.' + str(decimals) + 'f}'
    percent = fmt.format(100 * (iteration / float(total)))
    done = int(length * iteration // total)
    bar = fill * done + '-' * (length - done)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    # emit a final newline once the bar reaches 100%
    if iteration == total:
        print()
|
|
|
|
|
|
|
|
|
|
|
|
# returns a soup for the given url
async def request_soup(url):
    """Fetch *url* with the module header set and parse it with lxml.

    Retries up to 10 times on any error, sleeping one second between
    attempts. If every attempt fails, ``html`` stays None and is handed
    to BeautifulSoup as-is (matching the original behavior).
    """
    request = urlreq.Request(url, headers=hdr)
    html = None
    for _attempt in range(10):
        try:
            html = urlreq.urlopen(request).read()
        except Exception as e:
            print('[-]', e)
            await asyncio.sleep(1)
        else:
            break
    return BeautifulSoup(html, "lxml")
|
|
|
|
|
|
|
|
|
|
|
|
# returns all images for the given url
async def get_img_as(url):
    """Return the list of image src urls found on *url*.

    Tags are selected by the has_source filter; anything served from a
    'redditmedia' host is skipped (thumbnails, not full images).
    """
    soup = await request_soup(url)
    sources = []
    for tag in soup.find_all(has_source):
        if 'redditmedia' in tag['src']:
            continue
        try:
            sources.append(tag['src'])
        except KeyError:
            pass
    return sources
|
|
|
|
|
|
|
|
|
|
|
|
# returns the last post id in the given reddit page
async def get_next(url):
    """Return the data-fullname of the last image post on *url*.

    Used by the pagination loop to build the '?after=' query for the
    next page. Raises IndexError when the page produced no usable ids --
    the caller relies on that exception to stop paginating.
    """
    soup = await request_soup(url)
    fullnames = []
    for tag in soup.find_all(has_source):
        if 'redditmedia' in tag['src']:
            continue
        try:
            fullnames.append(tag['data-fullname'])
        except KeyError:
            pass
    # drop empty/None ids; [-1] intentionally raises IndexError on no match
    return [fid for fid in fullnames if fid][-1]
|
|
|
|
|
|
|
|
|
|
|
|
# returns if the given tag has a source attribute that is an image
def has_source(tag):
    """bs4 filter: True when *tag* carries an image source.

    Checks 'src' first; falls back to 'data-url' (lazy-loaded images),
    copying it into 'src' so downstream code can read tag['src'].

    BUG FIX: the original wrote ``except IndexError or KeyError:`` and
    ``except KeyError or KeyError:`` -- ``or`` on exception classes
    evaluates to the first class only, so KeyError was never caught in
    the first branch. Exception tuples are required to catch multiple
    types.
    """
    if tag.has_attr('src'):
        try:
            # an extension is an "image" if it's in the module whitelist
            return tag['src'].split('.')[-1].lower() in img_ext
        except (IndexError, KeyError):
            return False
    elif tag.has_attr('data-url'):
        try:
            # promote the lazy-load attribute so callers can use tag['src']
            tag['src'] = tag['data-url']
            return tag['src'].split('.')[-1].lower() in img_ext
        except KeyError:
            return False
    else:
        return False
|
|
|
|
|
|
|
|
|
|
|
|
# downloads all images for the given url and puts them in a zipfile
async def download_async(url, zfile=None, test=False):
    """Download every image found on *url* and archive it into *zfile*.

    url   -- page to scrape for image sources
    zfile -- open zipfile.ZipFile the downloaded images are written into
    test  -- when True, only walk the progress bar without downloading

    BUG FIXES vs. the original:
    * The blacklist check used ``img.strip('http://').strip('https://')``.
      str.strip removes a *character set*, not a prefix, so an https url
      degraded to 's://host/...' and the extracted "host" was 's:' --
      the blacklist never matched. The host is now taken by splitting
      on '//'.
    * ``except FileNotFoundError or PermissionError:`` only caught
      FileNotFoundError; an exception tuple catches both.
    * The image file is written inside a ``with`` block so the handle is
      closed even when the write fails.
    """
    images = await get_img_as(url)
    print('[+] Found %s images' % len(images))
    logmsg = ""
    imgcount = len(images)
    savedcount = 0
    count = 0
    print_progress(count, imgcount, prefix="Downloading: ", suffix="Complete")
    for img in images:
        print_progress(count + 1, imgcount, prefix="Downloading: ", suffix="Complete")
        count += 1
        if test:
            continue
        try:
            # make scheme-less or relative sources absolute
            if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                img = url + img
            if 'http' not in img.split('/')[0]:
                img = 'http:' + img
            # extract the hostname and skip blacklisted sources
            host = img.split('//', 1)[-1].split('/')[0]
            if host in blacklist:
                continue
            imgname = img.split('/')[-1]
            name = dl_dir + imgname
            if os.path.isfile(name):
                continue  # already cached from an earlier run
            req = urlreq.Request(img, headers=hdr)
            image = urlreq.urlopen(req)
            with open(name, "wb") as f:
                f.write(image.read())
            zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
            try:
                os.remove(name)  # drop the cache copy once archived
            except (FileNotFoundError, PermissionError):
                pass
            savedcount += 1
            await asyncio.sleep(0.25)  # be polite to the image hosts
        except Exception as error:
            # best-effort: collect failures for the summary line, keep going
            logmsg += '[-] Failed with %s %s\n' % (img, error)
    print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))
|
|
|
|
|
|
|
|
|
|
|
|
# loops over reddit-pages until no more images are found
async def dl_loop(section, zfile, loop, chaos=False, test=False):
    """Page through a subreddit, downloading the images on every page.

    section -- subreddit name (inserted into the module url template)
    zfile   -- open zipfile the downloads are archived into
    loop    -- event loop used to schedule download tasks
    chaos   -- when True, download tasks are fired without being awaited
    test    -- forwarded to download_async (dry-run mode)

    Terminates when get_next raises (no more post ids on the page); the
    zipfile is closed at that point.
    """
    baseurl = redditurl % section
    page = baseurl

    async def fetch(target):
        # chaos mode just schedules the task; normal mode waits for it
        task = loop.create_task(download_async(target, zfile, test))
        if not chaos:
            await task

    await fetch(page)
    while True:
        print('[*] Getting Images from %s' % page)
        try:
            after = await get_next(page)
            page = '{}/?after={}'.format(baseurl, after)
            await fetch(page)
        except Exception as ex:
            # no next id (or a fetch error): finish this subreddit
            print('[-]', ex)
            zfile.close()
            break
        finally:
            await asyncio.sleep(0.1)
|
|
|
|
|
|
|
|
|
|
|
|
# the main function
def main(sections, opts):
    """Create one zip archive per subreddit and run the download loop.

    sections -- list of subreddit names to scrape
    opts     -- parsed optparse options; .chaos and .test are read

    BUG FIX: in chaos mode the original called
    ``loop.create_task(loop.create_task(coro))``. The inner call returns
    a Task, and loop.create_task requires a coroutine, so the outer call
    raises TypeError. A single create_task schedules the loop correctly.
    """
    chaos = opts.chaos
    # the download cache must exist before any image file is written
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    zfiles = {}
    for sect in sections:
        # append to an existing archive instead of clobbering it
        mode = 'a' if os.path.isfile(sect + '.zip') else 'w'
        zfiles[sect] = zipfile.ZipFile('%s.zip' % sect, mode)
    loop = asyncio.get_event_loop()
    try:
        for sect in sections:
            if chaos:
                # fire-and-forget; run_forever below drives the tasks
                loop.create_task(
                    dl_loop(sect, zfiles[sect], loop, chaos=True, test=opts.test))
            else:
                loop.run_until_complete(
                    dl_loop(sect, zfiles[sect], loop, test=opts.test))
        if chaos:
            # chaos mode never exits on its own; CTRL+C is the only way out
            loop.run_forever()
    except KeyboardInterrupt:
        for sect in sections:
            try:
                zfiles[sect].close()
            except Exception as error:
                print(error)
    finally:
        # always clean up the download cache, even on error/interrupt
        shutil.rmtree(dl_dir)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # command-line interface: subreddit names are positional arguments
    # NOTE(review): optparse is deprecated in favor of argparse, but the
    # existing flags are kept for compatibility.
    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
    parser.add_option('-c', '--chaos', dest='chaos',
                      action='store_true', default=False,
                      help=""" Doesn't wait for previous downloads to finish and doesn't exit when no more
images can be found. Do only activate this if you want to download a lot of images
from multiple subreddits at the same time. Only option to exit is CTRL + C.""")
    parser.add_option('-t', '--test', dest='test',
                      action='store_true', default=False,
                      help='Tests the functions of the script')
    options, sects = parser.parse_args()
    # BUG FIX: user-facing typo 'Recieved' -> 'Received'
    print('[~] Received subreddits %s' % ', '.join(sects))
    main(sects, opts=options)
|