Merge pull request #7 from Trivernis/develop

Develop
Trivernis authored 6 years ago, committed by GitHub
commit 2e132b0bf1

@@ -45,11 +45,17 @@ jobs:
       # https://pytest.org
       # https://nose.readthedocs.io
       - run:
-          name: run tests
+          name: run tests for riddle.py
           command: |
             . venv/bin/activate
             python riddle.py -t Python
+      - run:
+          name: run tests for riddle2.py
+          command: |
+            . venv/bin/activate
+            python riddle2.py -t
       - store_artifacts:
           path: test-reports
           destination: test-reports

.gitignore
@@ -1,3 +1,7 @@
 *.zip
+*.log
+logs
 .idea
 .cache
+__pycache__.*
+__pycache__

README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 
 ## Scripts
 ### riddle.py
-Downloads all images from one ore more given subreddits
+Downloads all images from one or more given subreddits
 ```commandline
 Usage: riddle.py [options] [subreddits]
 
@@ -24,3 +24,22 @@ Options:
                         download a lot of images from multiple
                         subreddits at the same time.
 ```
+
+### riddle2.py
+Downloads all images from one or more given subreddits in a more predictable
+way than riddle.py.
+```commandline
+Usage: riddle2.py [options] [subreddits]
+
+Options:
+  -h, --help            show this help message and exit
+  -c COUNT, --count=COUNT
+                        The number of images to download.
+  -o OUTPUT, --output=OUTPUT
+                        The name of the output zipfile. If none is specified,
+                        it's the subreddit's name.
+  -t, --test            Tests the functions of the script
+  -l, --loop            Continuing download loop. When this option is set,
+                        the program searches for new images every 5 minutes.
+```
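As a usage illustration (not part of the commit), an invocation of the new script might look like this; the subreddit and archive name are placeholders:

```commandline
python riddle2.py -c 50 -o backgrounds.zip wallpapers
```

This downloads up to 50 images from r/wallpapers into backgrounds.zip; without -o the archive defaults to wallpapers.zip, named after the subreddit.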

conf/logging.config
@@ -0,0 +1,27 @@
[loggers]
keys=root

[handlers]
keys=stream_handler, file_handler

[formatters]
keys=formatter

[logger_root]
level=DEBUG
handlers=stream_handler, file_handler

[handler_stream_handler]
class=StreamHandler
level=FATAL
formatter=formatter
args=(sys.stderr,)

[handler_file_handler]
class=handlers.TimedRotatingFileHandler
level=DEBUG
formatter=formatter
args=('./logs/utility.log','midnight',1,5,'utf-8',False,True,)

[formatter_formatter]
format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
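For orientation (not part of the commit), this config is consumed via logging.config.fileConfig, which is what lib/logutils.py below does; a minimal sketch, assuming the file sits at ./conf/logging.config and that ./logs exists for the file handler:

```python
import logging
import os
from logging.config import fileConfig

os.makedirs('./logs', exist_ok=True)   # the TimedRotatingFileHandler writes to ./logs/utility.log
fileConfig('./conf/logging.config')    # wires up the [loggers]/[handlers]/[formatters] sections
logger = logging.getLogger('netutils')
logger.error('logged to the rotating file; only FATAL and above reach stderr')
```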

lib/cutils.py
@@ -0,0 +1,31 @@
class ProgressBar:
    """ A simple console progress bar. """

    def __init__(self, total=100, prefix='', suffix='', length=50, fill='█'):
        self.prefix = prefix
        self.suffix = suffix
        self.fill = fill
        self.length = length
        self.total = total
        self.progress = 0

    def tick(self):
        self.progress += 1
        self._print_progress()

    def setprogress(self, progress):
        self.progress = progress
        self._print_progress()

    def _print_progress(self):
        iteration = self.progress
        total = self.total
        prefix = self.prefix
        suffix = self.suffix
        percent = ("{0:." + str(1) + "f}").format(100 * (iteration / float(total)))
        filled_length = int(self.length * iteration // total)
        bar = self.fill * filled_length + '-' * (self.length - filled_length)
        print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
        # Print New Line on Complete
        if iteration == total:
            print()
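As a usage illustration (not part of the commit), a minimal sketch of the ProgressBar above, imported the way riddle2.py does; the loop bounds and labels are made up:

```python
import time

from lib import cutils

pb = cutils.ProgressBar(total=10, prefix='[~] Working', suffix='Complete')
for _ in range(10):
    time.sleep(0.1)  # stand-in for real work
    pb.tick()        # advance one step and redraw the bar in place
```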

lib/fsutils.py
@@ -0,0 +1,70 @@
import os
import shutil


def dir_exist_guarantee(dirpath):
    if not os.path.exists(dirpath):
        os.mkdir(dirpath)


def get_extension(fname: str):
    return fname.split('.')[-1]


class FileInfo:
    """ A simple wrapper around the os path functions that returns basic file info
        and lets you perform basic file tasks."""

    def __init__(self, fname: str):
        self._init_info(fname)

    def _init_info(self, fname):
        """ Sets all the required variables for performing file tasks and to
            access when working with the file object. """
        # stringvars
        self._path = os.path.normpath(fname.replace('\\', '/')).encode('utf-8')
        if not os.path.isfile(self._path):
            raise Exception("Not a File")
        self._extless, self.extension = os.path.splitext(self._path)
        self.dirname, self.basename = os.path.split(self._path)
        self.fullname = os.path.join(self.dirname, self.basename)
        # boolvars
        self.exist = os.path.exists(self.fullname)
        self.ismount = self.islink = False
        if self.exist:
            self.ismount = os.path.ismount(self.fullname)
            self.islink = os.path.islink(self.fullname)

    def delete(self):
        """ Deletes the file if it exists.
            Does nothing if it does not exist."""
        if self.exist:
            os.remove(self.fullname)

    def create(self):
        """ Creates the file if it doesn't exist.
            Does nothing if it does."""
        if not self.exist:
            with open(self.fullname, 'w') as f:
                f.write('')

    def reset(self):
        """ Opens the file and writes nothing into it. """
        with open(self.fullname, 'w') as f:
            f.write('')

    def open(self, mode: str):
        """ Returns the file opened with the open method. """
        self.create()
        return open(self.fullname, mode)

    def copy(self, dest: str):
        if self.exist:
            shutil.copyfile(self.fullname, dest)
            return FileInfo(dest)

    def move(self, dest: str):
        if self.exist:
            shutil.move(self.fullname, dest)
            self._init_info(dest)
        else:
            self._init_info(dest)
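As a usage illustration (not part of the commit), a short sketch of FileInfo; the paths are made up. Note that the constructor raises when the path is not an existing file, and that path attributes come back as bytes because of the encode call above:

```python
from lib import fsutils

fsutils.dir_exist_guarantee('./tmp')             # create the directory if it is missing
with open('./tmp/example.txt', 'w') as f:        # FileInfo expects an existing file
    f.write('hello')

info = fsutils.FileInfo('./tmp/example.txt')
print(info.basename, info.extension)             # b'example.txt' b'.txt'
copy = info.copy('./tmp/example-copy.txt')       # returns a FileInfo for the copy
info.delete()                                    # no-op if the file does not exist
```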

lib/logutils.py
@@ -0,0 +1,13 @@
import logging
from logging.config import fileConfig

from lib import fsutils


def get_logger(name=None):
    if fsutils.os.path.isfile('./conf/logging.config'):
        fsutils.dir_exist_guarantee('logs')
        fileConfig('./conf/logging.config')
    if name:
        return logging.getLogger(name)
    return logging.getLogger()
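As a usage illustration (not part of the commit), a minimal sketch of get_logger; the logger name is made up. With conf/logging.config present, records end up in ./logs/utility.log:

```python
from lib import logutils

logger = logutils.get_logger('riddle2')  # without a name, the root logger is returned

try:
    raise ValueError('example failure')
except ValueError as e:
    logger.exception(e)  # logs the message together with the traceback
```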

lib/netutils.py
@@ -0,0 +1,37 @@
import urllib.request as urlreq
import time

from bs4 import BeautifulSoup

from lib import logutils

logger = logutils.get_logger('netutils')


def get_soup4url(url: str, retrys: int = 2, headers: dict = urlreq.noheaders(), timeout: int = 30) -> BeautifulSoup:
    """ Returns a soup for the url """
    req = urlreq.Request(url, headers=headers)
    html = None
    for _ in range(0, retrys + 1):
        try:
            html = urlreq.urlopen(req, timeout=timeout).read()
            break
        except Exception as e:
            logger.exception(e)
            time.sleep(1)  # to avoid request flooding
    if html:
        soup = BeautifulSoup(html, "lxml")
        return soup
    return False


def download_file(url: str, dest: str, headers: dict = urlreq.noheaders()):
    """ Downloads the url's content into the destination file. """
    req = urlreq.Request(url, headers=headers)
    try:
        image = urlreq.urlopen(req)
    except ConnectionError:
        print('\n [-] Connection Error')
        return
    with open(dest, "wb") as f:  # only create the file once the request succeeded
        f.write(image.read())
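As a usage illustration (not part of the commit), a hedged sketch of the two helpers above; the subreddit URL and file name are placeholders:

```python
from lib import netutils

soup = netutils.get_soup4url('https://old.reddit.com/r/python/', retrys=1, timeout=10)
if soup:  # get_soup4url returns False when every attempt failed
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and src.startswith('http'):
            netutils.download_file(src, './' + src.split('/')[-1])
            break
```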

logs/utility.log
@@ -0,0 +1,61 @@
2018-11-20 11:15:43,247 netutils ERROR <urlopen error _ssl.c:830: The handshake operation timed out>
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 964, in send
self.connect()
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1400, in connect
server_hostname=server_hostname)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 817, in __init__
self.do_handshake()
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1077, in do_handshake
self._sslobj.do_handshake()
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 689, in do_handshake
self._sslobj.do_handshake()
socket.timeout: _ssl.c:830: The handshake operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
html = urlreq.urlopen(req, timeout=timeout).read()
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 526, in open
response = self._open(req, data)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 544, in _open
'_open', req)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1320, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error _ssl.c:830: The handshake operation timed out>
2018-11-20 14:11:39,064 netutils ERROR The read operation timed out
Traceback (most recent call last):
File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
html = urlreq.urlopen(req, timeout=timeout).read()
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 462, in read
s = self._safe_read(self.length)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 612, in _safe_read
chunk = self.fp.read(min(amt, MAXAMOUNT))
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1012, in recv_into
return self.read(nbytes, buffer)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 874, in read
return self._sslobj.read(len, buffer)
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 631, in read
v = self._sslobj.read(len, buffer)
socket.timeout: The read operation timed out

requirements.txt
@@ -1,4 +1,3 @@
 beautifulsoup4==4.6.3
-bs4==0.0.1
 lxml==4.2.5
 typing==3.6.4

riddle.py
@@ -8,11 +8,11 @@ import optparse
 import asyncio
 import shutil
 
-redditurl: str = 'https://old.reddit.com/r/%s'
-dl_dir: str = './.cache/'  # Format must be ./
-img_ext: List[str] = ['jpg', 'png', 'bmp']
-blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']
-hdr: Dict[str, str] = {
+redditurl: str = 'https://old.reddit.com/r/%s'  # the url for reddit with %s to insert the subreddit name
+dl_dir: str = './.cache/'  # Format must be ./  # the directory where files are cached. Will be created if it doesn't exist
+img_ext: List[str] = ['jpg', 'png', 'bmp']  # file extensions that are images
+blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']  # where images shouldn't be downloaded from
+hdr: Dict[str, str] = {  # request header
     'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko)
                      Chrome/23.0.1271.64 Safari/537.11""",
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -20,6 +20,7 @@ hdr: Dict[str, str] = {
     'Connection': 'keep-alive'}
 
 
+# prints a progress bar
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filled_length = int(length * iteration // total)
@@ -30,6 +31,7 @@ def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=10
     print()
 
 
+# returns a soup for the given url
 async def request_soup(url):
     req = urlreq.Request(url, headers=hdr)
     html = None
@@ -44,6 +46,7 @@ async def request_soup(url):
     return soup
 
 
+# returns all images for the given url
 async def get_img_as(url):
     soup = await request_soup(url)
     ret = []
@@ -56,6 +59,7 @@ async def get_img_as(url):
     return ret
 
 
+# returns the last post id in the given reddit page
 async def get_next(url):
     ids = []
     soup = await request_soup(url)
@@ -69,6 +73,7 @@ async def get_next(url):
     return [_id for _id in ids if _id][-1]
 
 
+# returns whether the given tag has a source attribute that is an image
 def has_source(tag):
     if tag.has_attr('src'):
         try:
@@ -85,6 +90,7 @@ def has_source(tag):
     return False
 
 
+# downloads all images for the given url and puts them in a zipfile
 async def download_async(url, zfile=None, test=False):
     images = await get_img_as(url)
     print('[+] Found %s images' % len(images))
@@ -127,6 +133,7 @@ async def download_async(url, zfile=None, test=False):
     print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))
 
 
+# loops over reddit-pages until no more images are found
 async def dl_loop(section, zfile, loop, chaos=False, test=False):
     baseurl = redditurl % section
     url = baseurl
@@ -151,6 +158,7 @@ async def dl_loop(section, zfile, loop, chaos=False, test=False):
         await asyncio.sleep(0.1)
 
 
+# the main function
 def main(sections, opts):
     chaos = opts.chaos
     if not os.path.exists(dl_dir):

riddle2.py
@@ -0,0 +1,206 @@
import zipfile
import time
import os
import sys
import optparse
import shutil

from lib import cutils, netutils, fsutils

blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
dl_dir = './.cache/'
img_ext = ['jpg', 'jpeg', 'png']  # image file extensions we are searching for
hdr = {  # request header
    'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko)
                     Chrome/23.0.1271.64 Safari/537.11""",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'}
errors = {}


def has_source(tag: netutils.BeautifulSoup) -> bool:
    if tag.has_attr('src'):
        try:
            return fsutils.get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    elif tag.has_attr('data-url'):
        try:
            tag['src'] = tag['data-url']
            return fsutils.get_extension(tag['src']) in img_ext
        except (IndexError, KeyError):
            return False
    else:
        return False


def get_next_url(baseurl: str, url: str):
    ids = []
    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return False
    for t in soup.find_all(has_source):
        if 'redditmedia' not in t['src']:
            try:
                fname = t['data-fullname']
                ids.append(fname)
            except KeyError:
                pass
    ids = [_id for _id in ids if _id]
    if len(ids) == 0:  # if no id was found, we can't get any further into the past
        return None
    _id = ids[-1]
    next_url = '{}/?after={}'.format(baseurl, _id)
    return next_url


def get_img4site(url: str) -> list:
    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return []
    ret = []
    sys.stdout.write('.')
    sys.stdout.flush()
    for t in soup.find_all(has_source):
        try:
            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
                img = t['src']
                if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                    img = url + img
                if 'http' not in img.split('/')[0]:
                    img = 'http:' + img
                if img.strip('http://').strip('https://').split('/')[0] in blacklist:
                    img = None
                if img:
                    ret.append(img)
        except KeyError:
            pass
    return ret


def get_img4sub(url: str, length: int = -1) -> list:
    baseurl = url
    imgs = []
    print('[~] 1/2 Getting images...')
    if length >= 0:
        x = 0
        while x < length:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            x = len(imgs)
            url = get_next_url(baseurl, url)
            if not url:
                break
            sys.stdout.write('\b')
        imgs = imgs[:length]
    else:
        while url:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            url = get_next_url(baseurl, url)
    print()
    print('[+] Found %s images' % len(imgs))
    return imgs


def download_images(imgs: list, zfile: zipfile.ZipFile):
    imgcount = len(imgs)
    fnames = [zinfo.filename for zinfo in zfile.infolist()]
    print('[~] 2/2 Downloading %s images' % imgcount)
    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloading", suffix="Complete")
    fsutils.dir_exist_guarantee(dl_dir)
    for img in imgs:
        pb.tick()
        imgname = img.split('/')[-1]
        name = os.path.join(dl_dir, imgname)
        if os.path.isfile(name) or imgname in fnames:
            continue
        netutils.download_file(img, name, headers=hdr)
        zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
        try:
            os.remove(name)
        except (FileNotFoundError, PermissionError):
            pass
        time.sleep(0.1)  # don't flood the server with requests
    added = len(zfile.infolist()) - len(fnames)
    print('[+] Added %s files to the zipfile' % added)


def download_subreddit(sub: str, count: int = -1, out: str = None):
    mode = 'w'
    zname = sub + '.zip'
    if out:
        zname = out
    if os.path.isfile(zname):
        mode = 'a'
    url = 'https://old.reddit.com/r/%s/' % sub
    imgs = get_img4sub(url, length=count)
    zfile = zipfile.ZipFile(zname, mode)
    download_images(imgs, zfile)
    zfile.close()


def cleanup():
    print('[~] Cleanup...')
    if os.path.isdir(dl_dir):
        shutil.rmtree(dl_dir)


def parser_init():
    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
    parser.add_option('-c', '--count', dest='count',
                      type='int', default=-1,
                      help='The number of images to download.')
    parser.add_option('-o', '--output', dest='output',
                      type='str', default=None,
                      help='The name of the output zipfile. If none is specified, it\'s the subreddit\'s name.')
    parser.add_option('-t', '--test', dest='test',
                      action='store_true', default=False,
                      help='Tests the functions of the script')
    parser.add_option('-l', '--loop', dest='loop',
                      action='store_true', default=False,
                      help="""Continuing download loop. When this option is set, the program searches for
                      new images every 5 minutes""")
    return parser.parse_args()


def download_subreddits(subreddits, count, output):
    for sub in subreddits:
        print('[~] Downloading %s' % sub)
        download_subreddit(sub, count=count, out=output)
        print()


def main():
    options, subreddits = parser_init()
    count = options.count
    output = options.output
    if options.test:
        count = 1
        subreddits = ['python']
        output = 'test.zip'
    if options.loop:
        while True:
            download_subreddits(subreddits, count, output)
            print('[~] Next Download in 5 minutes...')
            time.sleep(300)
    else:
        download_subreddits(subreddits, count, output)
    cleanup()
    if options.test:
        os.remove(output)
    if len(errors.keys()) > 0:
        print('[-] The following errors occurred:')
        for key in errors.keys():
            print(' %s times: %s' % (errors[key], key))


if __name__ == '__main__':
    main()