Merge pull request #7 from Trivernis/develop

Develop
6 years ago · 2e132b0bf1
parent 451802d552 685ce33d9a
commit 2e132b0bf1
13 changed files with 491 additions and 10 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -45,11 +45,17 @@ jobs:
      # https://pytest.org
      # https://nose.readthedocs.io
      - run:
-          name: run tests
+          name: run tests for riddle.py
          command: |
            . venv/bin/activate
            python riddle.py -t Python

+      - run:
+          name: run tests for riddle2.py
+          command: |
+            . venv/bin/activate
+            python riddle2.py -t
+
      - store_artifacts:
          path: test-reports
          destination: test-reports
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,7 @@
 *.zip
+*.log
+logs
 .idea
-.cache
+.cache
+__pycache__.*
+__pycache__
--- a/README.md
+++ b/README.md
@ -12,7 +12,7 @@ pip install -r requirements.txt
 ## Scripts

 ### riddle.py
-Downloads all images from one ore more given subreddits
+Downloads all images from one or more given subreddits
 ```commandline
 Usage: riddle.py [options] [subreddits]

@ -23,4 +23,23 @@ Options:
               images can be found. Do only activate this if you want to
               download a lot of images                       from multiple
               subreddits at the same time.
+```
+
+### riddle2.py
+Downloads all images from one or more given subreddits in a more predictable
+way than riddle.py.
+```commandline
+Usage: riddle2.py [options] [subreddits]
+
+Options:
+  -h, --help            show this help message and exit
+  -c COUNT, --count=COUNT
+                        The number of images to download.
+  -o OUTPUT, --output=OUTPUT
+                        The name of the output zipfile. If none is specified,
+                        it's the subreddits name.
+  -t, --test            Tests the functions of the script
+  -l, --loop            Continuing download loop. When this option is set
+                        every 5 Minutes the program searches for
+                        new images
 ```
--- a/conf/logging.config
+++ b/conf/logging.config
@ -0,0 +1,27 @@
+[loggers]
+keys=root
+
+[handlers]
+keys=stream_handler, file_handler
+
+[formatters]
+keys=formatter
+
+[logger_root]
+level=DEBUG
+handlers=stream_handler, file_handler
+
+[handler_stream_handler]
+class=StreamHandler
+level=FATAL
+formatter=formatter
+args=(sys.stderr,)
+
+[handler_file_handler]
+class=handlers.TimedRotatingFileHandler
+level=DEBUG
+formatter=formatter
+args=('./logs/utility.log','midnight',1,5,'utf-8',False,True,)
+
+[formatter_formatter]
+format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
--- a/lib/cutils.py
+++ b/lib/cutils.py
@ -0,0 +1,31 @@
+
+class ProgressBar:
+    def __init__(self, total=100, prefix='', suffix='', length=50, fill='█'):
+        self.prefix = prefix
+        self.suffix = suffix
+        self.fill = fill
+        self.length = length
+        self.total = total
+        self.progress = 0
+
+    def tick(self):
+        self.progress += 1
+        self._print_progress()
+
+    def setprogress(self, progress):
+        self.progress = progress
+        self._print_progress()
+
+    def _print_progress(self):
+        iteration = self.progress
+        total = self.total
+        prefix = self.prefix
+        suffix = self.suffix
+
+        percent = ("{0:." + str(1) + "f}").format(100 * (iteration / float(total)))
+        filled_length = int(self.length * iteration // total)
+        bar = self.fill * filled_length + '-' * (self.length - filled_length)
+        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
+        # Print New Line on Complete
+        if iteration == total:
+            print()
--- a/lib/fsutils.py
+++ b/lib/fsutils.py
@ -0,0 +1,70 @@
+import os
+import shutil
+
+
+def dir_exist_guarantee(dirpath):
+    if not os.path.exists(dirpath):
+        os.mkdir(dirpath)
+
+
+def get_extension(fname: str):
+    return fname.split('.')[-1]
+
+
+class FileInfo:
+    """ A simple wrapper around the os path functions that returns basic file info
+     and let's you peform basic file tasks."""
+    def __init__(self, fname: str):
+        self._init_info(fname)
+
+    def _init_info(self, fname):
+        """ Set's all the required variables for performing file tasks and to
+         access when working with the file object. """
+        # stringvars
+        self._path = os.path.normpath(fname.replace('\\', '/')).encode('utf-8')
+        if not os.path.isfile(self._path):
+            raise Exception("Not a File")
+        self._extless, self.extension = os.path.splitext(self._path)
+        self.dirname, self.basename = os.path.split(self._path)
+        self.fullname = os.path.join(self.dirname, self.basename)
+        # boolvars
+        self.exist = os.path.exists(self.fullname)
+        self.ismount = self.islink = False
+        if self.exist:
+            self.ismount = os.path.ismount(self.fullname)
+            self.islink = os.path.islink(self.fullname)
+
+    def delete(self):
+        """ Deletes the file if it exists.
+         Does nothing, if it does not exist."""
+        if self.exist:
+            os.remove(self.fullname)
+
+    def create(self):
+        """ Creates the file if it doesn't exist.
+         Does nothing, if it does."""
+        if not self.exist:
+            with open(self.fullname, 'w') as f:
+                f.write('');
+
+    def reset(self):
+        """ Opens the file and writes nothing into it. """
+        with open(self.fullname, 'w') as f:
+            f.write('')
+
+    def open(self, mode: str):
+        """ Returns the file opened with the open method. """
+        self.create()
+        return open(self.fullname, mode)
+
+    def copy(self, dest: str):
+        if self.exist:
+            shutil.copyfile(self.fullname, dest)
+            return FileInfo(dest)
+
+    def move(self, dest: str):
+        if self.exist:
+            shutil.move(self.fullname, dest)
+            self._init_info(dest)
+        else:
+            self._init_info(dest)
--- a/lib/logs/utility.log
+++ b/lib/logs/utility.log
--- a/lib/logutils.py
+++ b/lib/logutils.py
@ -0,0 +1,13 @@
+import logging
+from logging.config import fileConfig
+
+from lib import fsutils
+
+
+def get_logger(name=None):
+    if fsutils.os.path.isfile('./conf/logging.config'):
+        fsutils.dir_exist_guarantee('logs')
+        fileConfig('./conf/logging.config')
+    if name:
+        return logging.getLogger(name)
+    return logging.getLogger()
--- a/lib/netutils.py
+++ b/lib/netutils.py
@ -0,0 +1,37 @@
+import urllib.request as urlreq
+import time
+
+from bs4 import BeautifulSoup
+
+from lib import logutils
+
+# Module-level logger used by the helpers below to record failed requests.
+logger = logutils.get_logger('netutils')
+
+
+def get_soup4url(url: str, retrys: int =2, headers: dict=urlreq.noheaders(), timeout: int =30) -> BeautifulSoup:
+    """ Returns a soup for the url """
+    req = urlreq.Request(url, headers=headers)
+    html = None
+    for _ in range(0, retrys+1):
+        try:
+            html = urlreq.urlopen(req, timeout=timeout).read()
+            break
+        except Exception as e:
+            logger.exception(e)
+            time.sleep(1)  # to avoid request flooding
+    if html:
+        soup = BeautifulSoup(html, "lxml")
+        return soup
+    return False
+
+
+def download_file(url: str, dest: str, headers: dict=urlreq.noheaders()):
+    f = open(dest, "wb")
+    req = urlreq.Request(url, headers=headers)
+    try:
+        image = urlreq.urlopen(req)
+    except ConnectionError:
+        print('\n [-] Connection Error')
+        return
+    f.write(image.read())
+    f.close()
--- a/logs/utility.log
+++ b/logs/utility.log
@ -0,0 +1,61 @@
+2018-11-20 11:15:43,247 netutils     ERROR    <urlopen error _ssl.c:830: The handshake operation timed out>
+Traceback (most recent call last):
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1318, in do_open
+    encode_chunked=req.has_header('Transfer-encoding'))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1239, in request
+    self._send_request(method, url, body, headers, encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1285, in _send_request
+    self.endheaders(body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1234, in endheaders
+    self._send_output(message_body, encode_chunked=encode_chunked)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1026, in _send_output
+    self.send(msg)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 964, in send
+    self.connect()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1400, in connect
+    server_hostname=server_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 407, in wrap_socket
+    _context=self, _session=session)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 817, in __init__
+    self.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1077, in do_handshake
+    self._sslobj.do_handshake()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 689, in do_handshake
+    self._sslobj.do_handshake()
+socket.timeout: _ssl.c:830: The handshake operation timed out
+
+During handling of the above exception, another exception occurred:
+
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 223, in urlopen
+    return opener.open(url, data, timeout)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 526, in open
+    response = self._open(req, data)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 544, in _open
+    '_open', req)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 504, in _call_chain
+    result = func(*args)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1361, in https_open
+    context=self._context, check_hostname=self._check_hostname)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1320, in do_open
+    raise URLError(err)
+urllib.error.URLError: <urlopen error _ssl.c:830: The handshake operation timed out>
+2018-11-20 14:11:39,064 netutils     ERROR    The read operation timed out
+Traceback (most recent call last):
+  File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
+    html = urlreq.urlopen(req, timeout=timeout).read()
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 462, in read
+    s = self._safe_read(self.length)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 612, in _safe_read
+    chunk = self.fp.read(min(amt, MAXAMOUNT))
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\socket.py", line 586, in readinto
+    return self._sock.recv_into(b)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1012, in recv_into
+    return self.read(nbytes, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 874, in read
+    return self._sslobj.read(len, buffer)
+  File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 631, in read
+    v = self._sslobj.read(len, buffer)
+socket.timeout: The read operation timed out
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
 beautifulsoup4==4.6.3
-bs4==0.0.1
 lxml==4.2.5
 typing==3.6.4
--- a/riddle.py
+++ b/riddle.py
@ -8,11 +8,11 @@ import optparse
 import asyncio
 import shutil

-redditurl: str = 'https://old.reddit.com/r/%s'
-dl_dir: str = './.cache/'  # Format must be ./
-img_ext: List[str] = ['jpg', 'png', 'bmp']
-blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']
-hdr: Dict[str, str] = {
+redditurl: str = 'https://old.reddit.com/r/%s'      # the url for reddit with %s to insert the subreddit name
+dl_dir: str = './.cache/'  # Format must be ./      # the directory where files are cached. Will be created if it doesn't exist
+img_ext: List[str] = ['jpg', 'png', 'bmp']          # file extensions that are images
+blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']   # where images shouldn't be downloaded from
+hdr: Dict[str, str] = {                             # request header
    'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) 
                     Chrome/23.0.1271.64 Safari/537.11""",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@ -20,6 +20,7 @@ hdr: Dict[str, str] = {
    'Connection': 'keep-alive'}


+# prints a progress bar
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filled_length = int(length * iteration // total)
@ -30,6 +31,7 @@ def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=10
        print()


+# returns a soup for the given url
 async def request_soup(url):
    req = urlreq.Request(url, headers=hdr)
    html = None
@ -44,6 +46,7 @@ async def request_soup(url):
    return soup


+# returns all images for the given url
 async def get_img_as(url):
    soup = await  request_soup(url)
    ret = []
@ -56,6 +59,7 @@ async def get_img_as(url):
    return ret


+# returns the last post id in the given reddit page
 async def get_next(url):
    ids = []
    soup = await request_soup(url)
@ -69,6 +73,7 @@ async def get_next(url):
    return [_id for _id in ids if _id][-1]


+# returns if the given tag has a source attribute that is an image
 def has_source(tag):
    if tag.has_attr('src'):
        try:
@ -85,6 +90,7 @@ def has_source(tag):
        return False


+# downloads all images for the given url and puts them in a zipfile
 async def download_async(url, zfile=None, test=False):
    images = await get_img_as(url)
    print('[+] Found %s images' % len(images))
@ -95,7 +101,7 @@ async def download_async(url, zfile=None, test=False):
    print_progress(count, imgcount, prefix="Downloading: ", suffix="Complete")
    for img in images:
        print_progress(count+1, imgcount, prefix="Downloading: ", suffix="Complete")
-        count+=1
+        count += 1
        if test:
            continue
        try:
@ -127,6 +133,7 @@ async def download_async(url, zfile=None, test=False):
    print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))


+# loops over reddit-pages until no more images are found
 async def dl_loop(section, zfile, loop, chaos=False, test=False):
    baseurl = redditurl % section
    url = baseurl
@ -151,6 +158,7 @@ async def dl_loop(section, zfile, loop, chaos=False, test=False):
            await asyncio.sleep(0.1)


+# the main function
 def main(sections, opts):
    chaos = opts.chaos
    if not os.path.exists(dl_dir):
--- a/riddle2.py
+++ b/riddle2.py
@ -0,0 +1,206 @@
+import zipfile
+import time
+import os
+import sys
+import optparse
+import shutil
+
+from lib import cutils, netutils, fsutils
+
+# Hosts that images are not downloaded from (checked in get_img4site).
+blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
+# Directory the images are downloaded into before they are added to the zipfile;
+# created by download_images and removed again by cleanup.
+dl_dir = './.cache/'
+img_ext = ['jpg', 'jpeg', 'png']    # file extensions that are treated as images (see has_source)
+# NOTE: the User-Agent literal spans two source lines, so its value contains a
+# line break followed by the indentation of the second line.
+hdr = {                             # request header sent with every request
+    'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) 
+                     Chrome/23.0.1271.64 Safari/537.11""",
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
+    'Connection': 'keep-alive'}
+# Maps an error message to the number of times it occurred; printed at the end of main.
+# NOTE(review): nothing in the visible code adds entries to this dict - confirm whether it is still needed.
+errors = {}
+
+
+def has_source(tag: netutils.BeautifulSoup) -> bool:
+    if tag.has_attr('src'):
+        try:
+            return fsutils.get_extension(tag['src']) in img_ext
+        except IndexError or KeyError:
+            return False
+    elif tag.has_attr('data-url'):
+        try:
+            tag['src'] = tag['data-url']
+            return fsutils.get_extension(tag['src']) in img_ext
+        except IndexError or KeyError:
+            return False
+    else:
+        return False
+
+
+def get_next_url(baseurl: str, url: str):
+    ids = []
+    soup = netutils.get_soup4url(url, headers=hdr)
+    if not soup:
+        return False
+    for t in soup.find_all(has_source):
+        if 'redditmedia' not in t['src']:
+            try:
+                fname = t['data-fullname']
+                ids.append(fname)
+            except KeyError:
+                pass
+    ids = [_id for _id in ids if _id]
+    if len(ids) == 0:  # if no id was found, we can't get any further into the past
+        return None
+    _id = ids[-1]
+    next_url = '{}/?after={}'.format(baseurl, _id)
+    return next_url
+
+
+def get_img4site(url: str) -> list:
+    soup = netutils.get_soup4url(url, headers=hdr)
+    if not soup:
+        return []
+    ret = []
+    sys.stdout.write('.')
+    sys.stdout.flush()
+    for t in soup.find_all(has_source):
+        try:
+            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
+                img = t['src']
+                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
+                    img = url + img
+                if 'http' not in img.split('/')[0]:
+                    img = 'http:' + img
+                if img.strip('http://').strip('https://').split('/')[0] in blacklist:
+                    img = None
+                if img:
+                    ret.append(img)
+        except KeyError:
+            pass
+    return ret
+
+
+def get_img4sub(url: str, length: int =-1) -> list:
+    baseurl = url
+    imgs = []
+    print('[~] 1/2 Getting images...')
+    if length >= 0:
+        x = 0
+        while x < length:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            x = len(imgs)
+            url = get_next_url(baseurl, url)
+            if not url:
+                break
+            sys.stdout.write('\b')
+        imgs = imgs[:length]
+    else:
+        while url:
+            time.sleep(0.1)  # we don't want to flood with requests
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            url = get_next_url(baseurl, url)
+    print()
+    print('[+] Found %s images' % len(imgs))
+    return imgs
+
+
+def download_images(imgs: list, zfile: zipfile.ZipFile):
+    imgcount = len(imgs)
+    fnames = [zinfo.filename for zinfo in zfile.infolist()]
+    print('[~] 2/2 Downloading %s images' % imgcount)
+    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloadinng", suffix="Complete")
+    fsutils.dir_exist_guarantee(dl_dir)
+    for img in imgs:
+        pb.tick()
+        imgname = img.split('/')[-1]
+        name = os.path.join(dl_dir, imgname)
+        if os.path.isfile(name) or imgname in fnames:
+            continue
+        netutils.download_file(img, name, headers=hdr)
+        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
+        try:
+            os.remove(name)
+        except FileNotFoundError or PermissionError:
+            pass
+        time.sleep(0.1)  # no don't penetrate
+    added = len(zfile.infolist()) - len(fnames)
+    print('[+] Added %s files to the zipfile' % added)
+
+
+def download_subreddit(sub: str, count: int =-1, out: str =None):
+    mode = 'w'
+    zname = sub + '.zip'
+    if out:
+        zname = out
+    if os.path.isfile(zname):
+        mode = 'a'
+    url = 'https://old.reddit.com/r/%s/' % sub
+    imgs = get_img4sub(url, length=count)
+    zfile = zipfile.ZipFile(zname, mode)
+    download_images(imgs, zfile)
+    zfile.close()
+
+
+def cleanup():
+    print('[~] Cleanup...')
+    if os.path.isdir(dl_dir):
+        shutil.rmtree(dl_dir)
+
+
+def parser_init():
+    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
+    parser.add_option('-c', '--count', dest='count',
+                      type='int', default=-1,
+                      help='The number of images to download.')
+    parser.add_option('-o', '--output', dest='output',
+                      type='str', default=None,
+                      help='The name of the output zipfile. If none is specified, it\'s the subreddits name.')
+    parser.add_option('-t', '--test', dest='test',
+                      action='store_true', default=False,
+                      help='Tests the functions of the script')
+    parser.add_option('-l', '--loop', dest='loop',
+                      action='store_true', default=False,
+                      help="""Continuing download loop. When this option is set every 5 Minutes the program searches for
+                      new images""")
+    return parser.parse_args()
+
+
+def download_subreddits(subreddits, count, output):
+    for sub in subreddits:
+        print('[~] Downloading %s' % sub)
+        download_subreddit(sub, count=count, out=output)
+        print()
+
+
+def main():
+    options, subreddits = parser_init()
+    count = options.count
+    output = options.output
+    if options.test:
+        count = 1
+        subreddits = ['python']
+        output = 'test.zip'
+    if options.loop:
+        while True:
+            download_subreddits(subreddits, count, output)
+            print('[~] Next Download in 5 minues...')
+            time.sleep(300)
+    else:
+        download_subreddits(subreddits, count, output)
+    cleanup()
+    if options.test:
+        os.remove(output)
+    if len(errors.keys()) > 0:
+        print('[-] Following errors occured:')
+        for key in errors.keys():
+            print('    %s times: %s' % (errors[key], key))
+
+
+if __name__ == '__main__':
+    main()