commit 2e132b0bf1
@@ -1,3 +1,7 @@
 *.zip
+*.log
+logs
 .idea
 .cache
-__pycache__.*
+__pycache__
@@ -0,0 +1,27 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=stream_handler, file_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level=DEBUG
+handlers=stream_handler, file_handler
+
+[handler_stream_handler]
+class=StreamHandler
+level=FATAL
+formatter=formatter
+args=(sys.stderr,)
+
+[handler_file_handler]
+class=handlers.TimedRotatingFileHandler
+level=DEBUG
+formatter=formatter
+args=('./logs/utility.log','midnight',1,5,'utf-8',False,True,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
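Note that logging.config.fileConfig instantiates the handlers immediately, and since delay is passed as False, the TimedRotatingFileHandler opens ./logs/utility.log at load time, so the logs directory must exist first (which is why the get_logger helper later in this commit calls dir_exist_guarantee before loading the config). A minimal loading sketch, assuming the config path above:

    import os
    from logging.config import fileConfig

    os.makedirs('./logs', exist_ok=True)  # the file handler opens ./logs/utility.log on load
    fileConfig('./conf/logging.config')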
@@ -0,0 +1,31 @@
+
+class ProgressBar:
+    def __init__(self, total=100, prefix='', suffix='', length=50, fill='█'):
+        self.prefix = prefix
+        self.suffix = suffix
+        self.fill = fill
+        self.length = length
+        self.total = total
+        self.progress = 0
+
+    def tick(self):
+        self.progress += 1
+        self._print_progress()
+
+    def setprogress(self, progress):
+        self.progress = progress
+        self._print_progress()
+
+    def _print_progress(self):
+        iteration = self.progress
+        total = self.total
+        prefix = self.prefix
+        suffix = self.suffix
+
+        percent = "{0:.1f}".format(100 * (iteration / float(total)))
+        filled_length = int(self.length * iteration // total)
+        bar = self.fill * filled_length + '-' * (self.length - filled_length)
+        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+        # print a newline on completion
+        if iteration == total:
+            print()
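For reference, a minimal usage sketch of ProgressBar (the module path lib.cutils is assumed from the imports elsewhere in this commit; the loop body is a stand-in for real work):

    import time
    from lib.cutils import ProgressBar

    pb = ProgressBar(total=10, prefix='[~] Working', suffix='Complete')
    for _ in range(10):
        time.sleep(0.05)  # stand-in for real work
        pb.tick()         # advance by one and redraw the bar in place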
@@ -0,0 +1,70 @@
+import os
+import shutil
+
+
+def dir_exist_guarantee(dirpath):
+    if not os.path.exists(dirpath):
+        os.mkdir(dirpath)
+
+
+def get_extension(fname: str):
+    return fname.split('.')[-1]
+
+
+class FileInfo:
+    """ A simple wrapper around the os.path functions that returns basic file info
+    and lets you perform basic file tasks. """
+
+    def __init__(self, fname: str):
+        self._init_info(fname)
+
+    def _init_info(self, fname):
+        """ Sets all the variables required for performing file tasks and for
+        access when working with the file object. """
+        # string vars
+        self._path = os.path.normpath(fname.replace('\\', '/'))
+        if not os.path.isfile(self._path):
+            raise Exception("Not a File")
+        self._extless, self.extension = os.path.splitext(self._path)
+        self.dirname, self.basename = os.path.split(self._path)
+        self.fullname = os.path.join(self.dirname, self.basename)
+        # bool vars
+        self.exist = os.path.exists(self.fullname)
+        self.ismount = self.islink = False
+        if self.exist:
+            self.ismount = os.path.ismount(self.fullname)
+            self.islink = os.path.islink(self.fullname)
+
+    def delete(self):
+        """ Deletes the file if it exists.
+        Does nothing if it does not exist. """
+        if self.exist:
+            os.remove(self.fullname)
+
+    def create(self):
+        """ Creates the file if it doesn't exist.
+        Does nothing if it does. """
+        if not self.exist:
+            with open(self.fullname, 'w') as f:
+                f.write('')
+
+    def reset(self):
+        """ Opens the file and truncates it to empty. """
+        with open(self.fullname, 'w') as f:
+            f.write('')
+
+    def open(self, mode: str):
+        """ Returns the file opened with the built-in open function. """
+        self.create()
+        return open(self.fullname, mode)
+
+    def copy(self, dest: str):
+        if self.exist:
+            shutil.copyfile(self.fullname, dest)
+        return FileInfo(dest)
+
+    def move(self, dest: str):
+        if self.exist:
+            shutil.move(self.fullname, dest)
+        # re-point this object at the destination path either way
+        self._init_info(dest)
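A minimal usage sketch for FileInfo (the paths here are hypothetical; the constructor raises unless the path points at an existing file):

    from lib.fsutils import FileInfo

    info = FileInfo('./logs/utility.log')
    print(info.basename, info.extension)
    backup = info.copy('./logs/utility.log.bak')  # returns a FileInfo for the copy
    backup.delete()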
@@ -0,0 +1,13 @@
+import logging
+from logging.config import fileConfig
+
+from lib import fsutils
+
+
+def get_logger(name=None):
+    if fsutils.os.path.isfile('./conf/logging.config'):
+        fsutils.dir_exist_guarantee('logs')
+        fileConfig('./conf/logging.config')
+    if name:
+        return logging.getLogger(name)
+    return logging.getLogger()
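A minimal sketch of how this ties the config file above to a named logger ('example' is a hypothetical name; run from the project root so the relative ./conf and ./logs paths resolve):

    from lib import logutils

    logger = logutils.get_logger('example')
    logger.debug('reaches ./logs/utility.log via the DEBUG-level file handler')
    logger.fatal('also reaches stderr, since the stream handler is FATAL-only')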
@@ -0,0 +1,37 @@
+import urllib.request as urlreq
+import time
+
+from bs4 import BeautifulSoup
+
+from lib import logutils
+
+logger = logutils.get_logger('netutils')
+
+
+def get_soup4url(url: str, retries: int = 2, headers: dict = urlreq.noheaders(), timeout: int = 30) -> BeautifulSoup:
+    """ Returns a soup for the url, or False if every attempt fails. """
+    req = urlreq.Request(url, headers=headers)
+    html = None
+    for _ in range(0, retries + 1):
+        try:
+            html = urlreq.urlopen(req, timeout=timeout).read()
+            break
+        except Exception as e:
+            logger.exception(e)
+            time.sleep(1)  # to avoid request flooding
+    if html:
+        soup = BeautifulSoup(html, "lxml")
+        return soup
+    return False
+
+
+def download_file(url: str, dest: str, headers: dict = urlreq.noheaders()):
+    req = urlreq.Request(url, headers=headers)
+    try:
+        image = urlreq.urlopen(req)
+    except ConnectionError:
+        print('\n [-] Connection Error')
+        return
+    # open the destination only after the request succeeds, so a failed
+    # download does not leave an empty file behind
+    with open(dest, "wb") as f:
+        f.write(image.read())
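A minimal usage sketch (example.com stands in for a real target; lxml from requirements.txt must be installed for the parser):

    from lib import netutils

    soup = netutils.get_soup4url('https://example.com/')
    if soup:  # get_soup4url returns False when all retries fail
        print(soup.title)
    netutils.download_file('https://example.com/logo.png', './logo.png')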
@@ -0,0 +1,61 @@
+2018-11-20 11:15:43,247 netutils ERROR <urlopen error _ssl.c:830: The handshake operation timed out>
+Traceback (most recent call last):
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1318, in do_open
+    encode_chunked=req.has_header('Transfer-encoding'))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1239, in request
+    self._send_request(method, url, body, headers, encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1285, in _send_request
+    self.endheaders(body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1234, in endheaders
+    self._send_output(message_body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1026, in _send_output
+    self.send(msg)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 964, in send
+    self.connect()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1400, in connect
+    server_hostname=server_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 407, in wrap_socket
+    _context=self, _session=session)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 817, in __init__
+    self.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1077, in do_handshake
+    self._sslobj.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 689, in do_handshake
+    self._sslobj.do_handshake()
+socket.timeout: _ssl.c:830: The handshake operation timed out
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 223, in urlopen
+    return opener.open(url, data, timeout)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 526, in open
+    response = self._open(req, data)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 544, in _open
+    '_open', req)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 504, in _call_chain
+    result = func(*args)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1361, in https_open
+    context=self._context, check_hostname=self._check_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1320, in do_open
+    raise URLError(err)
+urllib.error.URLError: <urlopen error _ssl.c:830: The handshake operation timed out>
+2018-11-20 14:11:39,064 netutils ERROR The read operation timed out
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 462, in read
+    s = self._safe_read(self.length)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 612, in _safe_read
+    chunk = self.fp.read(min(amt, MAXAMOUNT))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\socket.py", line 586, in readinto
+    return self._sock.recv_into(b)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1012, in recv_into
+    return self.read(nbytes, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 874, in read
+    return self._sslobj.read(len, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 631, in read
+    v = self._sslobj.read(len, buffer)
+socket.timeout: The read operation timed out
@@ -1,4 +1,3 @@
 beautifulsoup4==4.6.3
-bs4==0.0.1
 lxml==4.2.5
 typing==3.6.4
@@ -0,0 +1,206 @@
+import zipfile
+import time
+import os
+import sys
+import optparse
+import shutil
+
+from lib import cutils, netutils, fsutils
+
+blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
+dl_dir = './.cache/'
+img_ext = ['jpg', 'jpeg', 'png']  # the image extensions we are searching for
+hdr = {  # request header
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
+                  'Chrome/23.0.1271.64 Safari/537.11',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
+    'Connection': 'keep-alive'}
+errors = {}
+
+
+def has_source(tag: netutils.BeautifulSoup) -> bool:
+    if tag.has_attr('src'):
+        try:
+            return fsutils.get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    elif tag.has_attr('data-url'):
+        try:
+            tag['src'] = tag['data-url']
+            return fsutils.get_extension(tag['src']) in img_ext
+        except (IndexError, KeyError):
+            return False
+    else:
+        return False
+
+
+def get_next_url(baseurl: str, url: str):
+    ids = []
+    soup = netutils.get_soup4url(url, headers=hdr)
+    if not soup:
+        return False
+    for t in soup.find_all(has_source):
+        if 'redditmedia' not in t['src']:
+            try:
+                fname = t['data-fullname']
+                ids.append(fname)
+            except KeyError:
+                pass
+    ids = [_id for _id in ids if _id]
+    if len(ids) == 0:  # if no id was found, we can't get any further into the past
+        return None
+    _id = ids[-1]
+    next_url = '{}/?after={}'.format(baseurl, _id)
+    return next_url
+
+
+def get_img4site(url: str) -> list:
+    soup = netutils.get_soup4url(url, headers=hdr)
+    if not soup:
+        return []
+    ret = []
+    sys.stdout.write('.')
+    sys.stdout.flush()
+    for t in soup.find_all(has_source):
+        try:
+            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
+                img = t['src']
+                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
+                    img = url + img
+                if 'http' not in img.split('/')[0]:
+                    img = 'http:' + img
+                # str.strip would remove characters, not a prefix, so take the
+                # host part explicitly before checking the blacklist
+                host = img.split('//')[-1].split('/')[0]
+                if host in blacklist:
+                    img = None
+                if img:
+                    ret.append(img)
+        except KeyError:
+            pass
+    return ret
+
+
+def get_img4sub(url: str, length: int = -1) -> list:
+    baseurl = url
+    imgs = []
+    print('[~] 1/2 Getting images...')
+    if length >= 0:
+        x = 0
+        while x < length:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            x = len(imgs)
+            url = get_next_url(baseurl, url)
+            if not url:
+                break
+        sys.stdout.write('\b')
+        imgs = imgs[:length]
+    else:
+        while url:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            url = get_next_url(baseurl, url)
+    print()
+    print('[+] Found %s images' % len(imgs))
+    return imgs
+
+
+def download_images(imgs: list, zfile: zipfile.ZipFile):
+    imgcount = len(imgs)
+    fnames = [zinfo.filename for zinfo in zfile.infolist()]
+    print('[~] 2/2 Downloading %s images' % imgcount)
+    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloading", suffix="Complete")
+    fsutils.dir_exist_guarantee(dl_dir)
+    for img in imgs:
+        pb.tick()
+        imgname = img.split('/')[-1]
+        name = os.path.join(dl_dir, imgname)
+        if os.path.isfile(name) or imgname in fnames:
+            continue
+        netutils.download_file(img, name, headers=hdr)
+        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
+        try:
+            os.remove(name)
+        except (FileNotFoundError, PermissionError):
+            pass
+        time.sleep(0.1)  # don't flood the server with requests
+    added = len(zfile.infolist()) - len(fnames)
+    print('[+] Added %s files to the zipfile' % added)
+
+
+def download_subreddit(sub: str, count: int = -1, out: str = None):
+    mode = 'w'
+    zname = sub + '.zip'
+    if out:
+        zname = out
+    if os.path.isfile(zname):
+        mode = 'a'
+    url = 'https://old.reddit.com/r/%s/' % sub
+    imgs = get_img4sub(url, length=count)
+    zfile = zipfile.ZipFile(zname, mode)
+    download_images(imgs, zfile)
+    zfile.close()
+
+
+def cleanup():
+    print('[~] Cleanup...')
+    if os.path.isdir(dl_dir):
+        shutil.rmtree(dl_dir)
+
+
+def parser_init():
+    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
+    parser.add_option('-c', '--count', dest='count',
+                      type='int', default=-1,
+                      help='The number of images to download.')
+    parser.add_option('-o', '--output', dest='output',
+                      type='str', default=None,
+                      help='The name of the output zipfile. If none is specified, it\'s the subreddit\'s name.')
+    parser.add_option('-t', '--test', dest='test',
+                      action='store_true', default=False,
+                      help='Tests the functions of the script.')
+    parser.add_option('-l', '--loop', dest='loop',
+                      action='store_true', default=False,
+                      help='Continuous download loop. When this option is set, the program searches for '
+                           'new images every 5 minutes.')
+    return parser.parse_args()
+
+
+def download_subreddits(subreddits, count, output):
+    for sub in subreddits:
+        print('[~] Downloading %s' % sub)
+        download_subreddit(sub, count=count, out=output)
+        print()
+
+
+def main():
+    options, subreddits = parser_init()
+    count = options.count
+    output = options.output
+    if options.test:
+        count = 1
+        subreddits = ['python']
+        output = 'test.zip'
+    if options.loop:
+        while True:
+            download_subreddits(subreddits, count, output)
+            print('[~] Next download in 5 minutes...')
+            time.sleep(300)
+    else:
+        download_subreddits(subreddits, count, output)
+    cleanup()
+    if options.test:
+        os.remove(output)
+    if errors:
+        print('[-] The following errors occurred:')
+        for key in errors:
+            print(' %s times: %s' % (errors[key], key))
+
+
+if __name__ == '__main__':
+    main()
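A hedged usage note (the script's filename is not shown in this view; <script>.py stands in for it):

    # download up to 100 images from r/earthporn into earthporn.zip
    python <script>.py -c 100 earthporn

    # quick self-test: one image from r/python into test.zip, removed afterwards
    python <script>.py -t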