commit
2e132b0bf1
@ -1,3 +1,7 @@
|
||||
*.zip
|
||||
*.log
|
||||
logs
|
||||
.idea
|
||||
.cache
|
||||
__pycache__.*
|
||||
__pycache__
|
@ -0,0 +1,27 @@
|
||||
[loggers]
|
||||
keys=root
|
||||
|
||||
[handlers]
|
||||
keys=stream_handler, file_handler
|
||||
|
||||
[formatters]
|
||||
keys=formatter
|
||||
|
||||
[logger_root]
|
||||
level=DEBUG
|
||||
handlers=stream_handler, file_handler
|
||||
|
||||
[handler_stream_handler]
|
||||
class=StreamHandler
|
||||
level=FATAL
|
||||
formatter=formatter
|
||||
args=(sys.stderr,)
|
||||
|
||||
[handler_file_handler]
|
||||
class=handlers.TimedRotatingFileHandler
|
||||
level=DEBUG
|
||||
formatter=formatter
|
||||
args=('./logs/utility.log','midnight',1,5,'utf-8',False,True,)
|
||||
|
||||
[formatter_formatter]
|
||||
format=%(asctime)s %(name)-12s %(levelname)-8s %(message)s
|
@ -0,0 +1,31 @@
|
||||
|
||||
class ProgressBar:
    """Single-line text progress bar that redraws itself in place on stdout.

    :param total: progress value that counts as 100 %.
    :param prefix: text printed in front of the bar.
    :param suffix: text printed behind the percentage.
    :param length: width of the bar in characters.
    :param fill: character used for the completed part of the bar.
    """

    def __init__(self, total=100, prefix='', suffix='', length=50, fill='█'):
        self.prefix = prefix
        self.suffix = suffix
        self.fill = fill
        self.length = length
        self.total = total
        self.progress = 0

    def tick(self):
        """Advance the progress by one step and redraw the bar."""
        self.progress += 1
        self._print_progress()

    def setprogress(self, progress):
        """Set the progress to an absolute value and redraw the bar."""
        self.progress = progress
        self._print_progress()

    def _print_progress(self):
        """Render the bar for the current progress.

        The displayed value is clamped to the range 0..total so that the bar
        never grows beyond ``length`` characters, and a non-positive ``total``
        is rendered as a completed bar instead of dividing by zero.
        """
        total = self.total
        if total > 0:
            shown = min(max(self.progress, 0), total)
            fraction = shown / float(total)
            filled_length = int(self.length * shown // total)
        else:
            # Nothing to do counts as done; avoids ZeroDivisionError.
            fraction = 1.0
            filled_length = self.length

        percent = "{0:.1f}".format(100 * fraction)
        bar = self.fill * filled_length + '-' * (self.length - filled_length)
        # Leading and trailing carriage returns keep the cursor on the same
        # line so the next call overwrites this output.
        print('\r%s |%s| %s%% %s' % (self.prefix, bar, percent, self.suffix), end='\r')
        # Print New Line on Complete
        if self.progress == total:
            print()
|
@ -0,0 +1,70 @@
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
def dir_exist_guarantee(dirpath):
    """Make sure the directory *dirpath* exists, creating it when necessary.

    Missing parent directories are created as well, and an already existing
    directory is not an error, so the call is safe to repeat.

    :param dirpath: path of the directory that must exist afterwards.
    :raises FileExistsError: if *dirpath* exists but is not a directory.
    """
    # exist_ok avoids the race between a separate existence check and mkdir.
    os.makedirs(dirpath, exist_ok=True)
|
||||
|
||||
|
||||
def get_extension(fname: str):
    """Return the extension of *fname* without the leading dot.

    The extension is the text after the last ``.``. An empty string is
    returned when there is no dot at all, or when the last dot belongs to a
    directory or host part (the text after it still contains a path
    separator), e.g. ``http://example.com/path``.

    :param fname: file name, path or url.
    :return: the extension, or ``''`` if *fname* has none.
    """
    _, dot, ext = fname.rpartition('.')
    if not dot or '/' in ext or '\\' in ext:
        return ''
    return ext
|
||||
|
||||
|
||||
class FileInfo:
    """Wrapper around the ``os.path`` / ``shutil`` functions for a single file.

    On construction the path is normalised and split into its parts, which
    are exposed as attributes (``dirname``, ``basename``, ``extension``,
    ``fullname``) together with the flags ``exist``, ``ismount`` and
    ``islink``. The flags are refreshed after every operation that can
    change them.

    NOTE(review): the path is stored UTF-8 encoded, so all path attributes
    are ``bytes`` objects, not ``str`` - confirm that callers expect this.
    """

    def __init__(self, fname: str):
        """Collect the information for the existing file *fname*.

        :raises Exception: if *fname* is not an existing regular file.
        """
        self._init_info(fname)

    def _init_info(self, fname):
        """Set all variables required for the file tasks and for inspection
        of the file object."""
        # stringvars
        self._path = os.path.normpath(fname.replace('\\', '/')).encode('utf-8')
        if not os.path.isfile(self._path):
            raise Exception("Not a File")
        self._extless, self.extension = os.path.splitext(self._path)
        self.dirname, self.basename = os.path.split(self._path)
        self.fullname = os.path.join(self.dirname, self.basename)
        # boolvars
        self._refresh_state()

    def _refresh_state(self):
        """Re-read the existence, mount and link flags from the file system."""
        self.exist = os.path.exists(self.fullname)
        self.ismount = self.exist and os.path.ismount(self.fullname)
        self.islink = self.exist and os.path.islink(self.fullname)

    def delete(self):
        """Delete the file if it exists; do nothing if it does not."""
        try:
            os.remove(self.fullname)
        except FileNotFoundError:
            pass
        self._refresh_state()

    def create(self):
        """Create the file as an empty file if it does not exist; do nothing
        if it does."""
        # Check the file system instead of the cached flag so that a file
        # removed through delete() can be created again.
        if not os.path.exists(self.fullname):
            with open(self.fullname, 'w'):
                pass
        self._refresh_state()

    def reset(self):
        """Truncate the file to zero length, creating it if necessary."""
        with open(self.fullname, 'w'):
            pass
        self._refresh_state()

    def open(self, mode: str):
        """Return the file opened with the builtin ``open`` in *mode*.

        The file is created first when it does not exist.
        """
        self.create()
        return open(self.fullname, mode)

    def copy(self, dest: str):
        """Copy the file contents to *dest* and return a ``FileInfo`` for it.

        :raises Exception: if *dest* is not a file afterwards, which happens
            when the source no longer exists and *dest* did not exist either.
        """
        if os.path.exists(self.fullname):
            shutil.copyfile(self.fullname, dest)
        return FileInfo(dest)

    def move(self, dest: str):
        """Move the file to *dest* and re-initialise this object for the new
        location.

        :raises Exception: if *dest* is not a file afterwards.
        """
        if os.path.exists(self.fullname):
            shutil.move(self.fullname, dest)
        self._init_info(dest)
|
@ -0,0 +1,13 @@
|
||||
import logging
|
||||
from logging.config import fileConfig
|
||||
|
||||
from lib import fsutils
|
||||
|
||||
|
||||
def get_logger(name=None):
    """Return the logger called *name*, or the root logger if no name is given.

    On the first call that finds ``./conf/logging.config`` the ``logs``
    directory is created and the logging configuration is loaded from that
    file. The configuration is applied only once and with
    ``disable_existing_loggers=False``: re-applying it with the default
    setting would disable loggers that were handed out by earlier calls.

    :param name: logger name; ``None`` or an empty string selects the root
        logger.
    :return: the requested ``logging.Logger``.
    """
    import os  # local import: this module does not import os at file level

    config_path = './conf/logging.config'
    already_configured = getattr(get_logger, '_configured', False)
    if not already_configured and os.path.isfile(config_path):
        fsutils.dir_exist_guarantee('logs')
        fileConfig(config_path, disable_existing_loggers=False)
        get_logger._configured = True
    if name:
        return logging.getLogger(name)
    return logging.getLogger()
|
@ -0,0 +1,37 @@
|
||||
import urllib.request as urlreq
|
||||
import time
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from lib import logutils
|
||||
|
||||
# Module-level logger named 'netutils', obtained through the project's logutils helper.
logger = logutils.get_logger('netutils')
|
||||
|
||||
|
||||
def get_soup4url(url: str, retrys: int = 2, headers: dict = None, timeout: int = 30) -> BeautifulSoup:
    """Fetch *url* and return its content parsed as a soup.

    The request is attempted up to ``retrys + 1`` times. Every failure is
    logged with its traceback, and one second is waited between attempts.

    :param url: address of the page to fetch.
    :param retrys: number of additional attempts after the first failure.
    :param headers: request headers; ``None`` sends no extra headers.
    :param timeout: timeout in seconds for each attempt.
    :return: the parsed ``BeautifulSoup`` object, or ``False`` if no attempt
        returned any content (callers test the result for truthiness).
    """
    # A None default avoids sharing one header object between all calls.
    req = urlreq.Request(url, headers=headers if headers is not None else {})
    html = None
    attempts = retrys + 1
    for attempt in range(attempts):
        try:
            # The context manager closes the connection even if read() fails.
            with urlreq.urlopen(req, timeout=timeout) as response:
                html = response.read()
            break
        except Exception as e:
            logger.exception(e)
            if attempt < attempts - 1:
                time.sleep(1)  # to avoid request flooding
    if html:
        return BeautifulSoup(html, "lxml")
    return False
|
||||
|
||||
|
||||
def download_file(url: str, dest: str, headers: dict = None):
    """Download *url* and store the response body in the file *dest*.

    The destination file is only created after the download succeeded, so a
    failed request leaves neither an empty file nor an open file handle
    behind.

    :param url: address of the resource to download.
    :param dest: path of the file to write.
    :param headers: request headers; ``None`` sends no extra headers.
    :return: ``True`` if the file was written, ``False`` if the request
        failed (a message is printed in that case).
    """
    req = urlreq.Request(url, headers=headers if headers is not None else {})
    try:
        with urlreq.urlopen(req) as response:
            payload = response.read()
    except OSError:
        # URLError/HTTPError, ConnectionError and socket timeouts all derive
        # from OSError; ConnectionError alone misses errors raised by urlopen.
        print('\n [-] Connection Error')
        return False
    with open(dest, "wb") as f:
        f.write(payload)
    return True
|
@ -0,0 +1,61 @@
|
||||
2018-11-20 11:15:43,247 netutils ERROR <urlopen error _ssl.c:830: The handshake operation timed out>
|
||||
Traceback (most recent call last):
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1318, in do_open
|
||||
encode_chunked=req.has_header('Transfer-encoding'))
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1239, in request
|
||||
self._send_request(method, url, body, headers, encode_chunked)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1285, in _send_request
|
||||
self.endheaders(body, encode_chunked=encode_chunked)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1234, in endheaders
|
||||
self._send_output(message_body, encode_chunked=encode_chunked)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1026, in _send_output
|
||||
self.send(msg)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 964, in send
|
||||
self.connect()
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 1400, in connect
|
||||
server_hostname=server_hostname)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 407, in wrap_socket
|
||||
_context=self, _session=session)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 817, in __init__
|
||||
self.do_handshake()
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1077, in do_handshake
|
||||
self._sslobj.do_handshake()
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 689, in do_handshake
|
||||
self._sslobj.do_handshake()
|
||||
socket.timeout: _ssl.c:830: The handshake operation timed out
|
||||
|
||||
During handling of the above exception, another exception occurred:
|
||||
|
||||
Traceback (most recent call last):
|
||||
File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
|
||||
html = urlreq.urlopen(req, timeout=timeout).read()
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 223, in urlopen
|
||||
return opener.open(url, data, timeout)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 526, in open
|
||||
response = self._open(req, data)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 544, in _open
|
||||
'_open', req)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 504, in _call_chain
|
||||
result = func(*args)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1361, in https_open
|
||||
context=self._context, check_hostname=self._check_hostname)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\urllib\request.py", line 1320, in do_open
|
||||
raise URLError(err)
|
||||
urllib.error.URLError: <urlopen error _ssl.c:830: The handshake operation timed out>
|
||||
2018-11-20 14:11:39,064 netutils ERROR The read operation timed out
|
||||
Traceback (most recent call last):
|
||||
File "C:\Users\dev\Documents\Projekte\python-utility-scripts\lib\netutils.py", line 15, in get_soup4url
|
||||
html = urlreq.urlopen(req, timeout=timeout).read()
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 462, in read
|
||||
s = self._safe_read(self.length)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\http\client.py", line 612, in _safe_read
|
||||
chunk = self.fp.read(min(amt, MAXAMOUNT))
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\socket.py", line 586, in readinto
|
||||
return self._sock.recv_into(b)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 1012, in recv_into
|
||||
return self.read(nbytes, buffer)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 874, in read
|
||||
return self._sslobj.read(len, buffer)
|
||||
File "C:\ProgramData\Anaconda3\envs\python-utility-scripts\lib\ssl.py", line 631, in read
|
||||
v = self._sslobj.read(len, buffer)
|
||||
socket.timeout: The read operation timed out
|
@ -1,4 +1,3 @@
|
||||
beautifulsoup4==4.6.3
|
||||
bs4==0.0.1
|
||||
lxml==4.2.5
|
||||
typing==3.6.4
|
@ -0,0 +1,206 @@
|
||||
import zipfile
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
import optparse
|
||||
import shutil
|
||||
|
||||
from lib import cutils, netutils, fsutils
|
||||
|
||||
# Hosts whose images are dropped from the result lists.
blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
# Directory the images are downloaded to before they are added to the zip file.
dl_dir = './.cache/'
img_ext = ['jpg', 'jpeg', 'png']  # define the urls we are searching for
hdr = {  # request header
    # Built from adjacent string literals so that the header value contains
    # no line break (a triple-quoted literal would embed one).
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
                   'Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
# Error description -> number of occurrences; reported at the end of main().
errors = {}
|
||||
|
||||
|
||||
def has_source(tag: netutils.BeautifulSoup) -> bool:
    """Tell whether *tag* references an image with one of the wanted extensions.

    The ``src`` attribute is checked first. If it is missing, ``data-url`` is
    used instead and copied into ``src``, so that code processing the matched
    tags can always read ``tag['src']``.

    :param tag: the tag to inspect (used as a filter for ``find_all``).
    :return: ``True`` if the extension of the source is listed in ``img_ext``.
    """
    if tag.has_attr('src'):
        source = tag['src']
    elif tag.has_attr('data-url'):
        source = tag['src'] = tag['data-url']
    else:
        return False
    try:
        return fsutils.get_extension(source) in img_ext
    # A tuple is required here: "except IndexError or KeyError" evaluates to
    # "except IndexError" and never catches KeyError.
    except (IndexError, KeyError):
        return False
|
||||
|
||||
|
||||
def get_next_url(baseurl: str, url: str):
    """Build the url of the listing page that follows *url*.

    The page at *url* is fetched and the ``data-fullname`` of the last
    matching image tag (ignoring ``redditmedia`` sources) is used as the
    ``after`` parameter of the next page.

    :param baseurl: url of the first listing page; the query is appended to it.
    :param url: url of the page that is currently being processed.
    :return: the url of the next page, ``None`` if the page contains no usable
        id, or ``False`` if the page could not be fetched.
    """
    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return False
    ids = []
    for t in soup.find_all(has_source):
        if 'redditmedia' in t['src']:
            continue
        try:
            fullname = t['data-fullname']
        except KeyError:
            continue
        if fullname:
            ids.append(fullname)
    if not ids:  # if no id was found, we can't get any further into the past
        return None
    # The base url may already end with a slash; strip it to avoid "//?after=".
    return '{}/?after={}'.format(baseurl.rstrip('/'), ids[-1])
|
||||
|
||||
|
||||
def get_img4site(url: str) -> list:
    """Return the urls of all wanted images found on the page at *url*.

    Sources containing ``redditmedia`` or ``icon`` are skipped, as are images
    hosted on one of the hosts in ``blacklist``. Protocol-relative sources get
    the ``http:`` scheme and relative sources are resolved against *url*.
    A dot is written to stdout for every page that was fetched successfully.

    :param url: address of the page to scan.
    :return: list of absolute image urls; empty if the page could not be
        fetched.
    """
    from urllib.parse import urljoin, urlsplit  # not imported at file level

    soup = netutils.get_soup4url(url, headers=hdr)
    if not soup:
        return []
    ret = []
    sys.stdout.write('.')
    sys.stdout.flush()
    for t in soup.find_all(has_source):
        try:
            img = t['src']
        except KeyError:
            continue
        if 'redditmedia' in img or 'icon' in img:
            continue
        if img.startswith('//'):
            img = 'http:' + img
        elif not img.startswith(('http://', 'https://')):
            # urljoin resolves the path correctly even if the page url has a
            # query string, unlike plain concatenation.
            img = urljoin(url, img)
        # str.strip('http://') removes a set of characters, not a prefix, and
        # would damage host names; take the host from the parsed url instead.
        if urlsplit(img).netloc in blacklist:
            continue
        ret.append(img)
    return ret
|
||||
|
||||
|
||||
def get_img4sub(url: str, length: int =-1) -> list:
    """Collect image urls from a listing, following it page by page.

    Starting at *url*, every page is scanned with ``get_img4site`` and the
    following page is determined with ``get_next_url``. Collecting stops as
    soon as a page yields no images or no following page is found.

    :param url: url of the first page; also used as base url for building
        the urls of the following pages.
    :param length: maximum number of image urls to return; a negative value
        (the default) collects until the pages run out.
    :return: list of image urls, cut to *length* entries if *length* >= 0.
    """
    baseurl = url
    imgs = []
    print('[~] 1/2 Getting images...')
    if length >= 0:
        x = 0  # number of image urls collected so far
        while x < length:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            x = len(imgs)
            url = get_next_url(baseurl, url)
            if not url:
                break
            # NOTE(review): assumed to be part of the loop body (moves the
            # cursor back one column after each page) - confirm indentation.
            sys.stdout.write('\b')
        # The last page may have pushed the list beyond the requested length.
        imgs = imgs[:length]
    else:
        while url:
            time.sleep(0.1)  # we don't want to flood with requests
            imgurls = get_img4site(url)
            if not imgurls:
                break
            imgs.extend(imgurls)
            url = get_next_url(baseurl, url)
    # NOTE(review): assumed to run for both branches (ends the line of
    # progress dots written by get_img4site) - confirm indentation.
    print()
    print('[+] Found %s images' % len(imgs))
    return imgs
|
||||
|
||||
|
||||
def download_images(imgs: list, zfile: zipfile.ZipFile):
    """Download every image in *imgs* and add it to the zip file *zfile*.

    Each image is downloaded to ``dl_dir``, written to the archive under its
    file name and removed from ``dl_dir`` again. Images whose name already
    exists in the archive or in ``dl_dir`` are skipped. A failed download is
    counted in ``errors`` and does not abort the remaining downloads.

    :param imgs: list of image urls.
    :param zfile: zip file opened for writing or appending.
    """
    imgcount = len(imgs)
    initial_count = len(zfile.infolist())
    # A set gives fast lookups; it is extended while downloading so that the
    # same name is not added twice within one run.
    fnames = {zinfo.filename for zinfo in zfile.infolist()}
    print('[~] 2/2 Downloading %s images' % imgcount)
    pb = cutils.ProgressBar(total=imgcount, prefix="[~] 2/2 Downloadinng", suffix="Complete")
    fsutils.dir_exist_guarantee(dl_dir)
    for img in imgs:
        pb.tick()
        imgname = img.split('/')[-1]
        name = os.path.join(dl_dir, imgname)
        if os.path.isfile(name) or imgname in fnames:
            continue
        try:
            netutils.download_file(img, name, headers=hdr)
        except OSError as err:
            # Network errors derive from OSError; count them for the report.
            errors[str(err)] = errors.get(str(err), 0) + 1
        # Only archive a file that was really downloaded; a failed download
        # leaves no file or an empty one.
        if os.path.isfile(name) and os.path.getsize(name) > 0:
            zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
            fnames.add(imgname)
        try:
            os.remove(name)
        # A tuple is required: "except A or B" only catches A.
        except (FileNotFoundError, PermissionError):
            pass
        time.sleep(0.1)  # no don't penetrate
    added = len(zfile.infolist()) - initial_count
    print('[+] Added %s files to the zipfile' % added)
|
||||
|
||||
|
||||
def download_subreddit(sub: str, count: int = -1, out: str = None):
    """Download the images of the subreddit *sub* into a zip file.

    An existing zip file is opened in append mode, otherwise a new one is
    created.

    :param sub: name of the subreddit.
    :param count: maximum number of images; a negative value means no limit.
    :param out: name of the zip file; defaults to ``<sub>.zip``.
    """
    zname = out if out else sub + '.zip'
    mode = 'a' if os.path.isfile(zname) else 'w'
    url = 'https://old.reddit.com/r/%s/' % sub
    imgs = get_img4sub(url, length=count)
    # The context manager closes the archive even if a download raises.
    with zipfile.ZipFile(zname, mode) as zfile:
        download_images(imgs, zfile)
|
||||
|
||||
|
||||
def cleanup():
    """Announce the cleanup step and remove the download directory tree, if
    it exists."""
    print('[~] Cleanup...')
    if not os.path.isdir(dl_dir):
        return
    shutil.rmtree(dl_dir)
|
||||
|
||||
|
||||
def parser_init():
    """Define the command line options and parse the process arguments.

    :return: the ``(options, args)`` pair produced by ``parse_args``; the
        positional arguments are the subreddit names.
    """
    option_specs = (
        (('-c', '--count'),
         dict(dest='count', type='int', default=-1,
              help='The number of images to download.')),
        (('-o', '--output'),
         dict(dest='output', type='str', default=None,
              help='The name of the output zipfile. If none is specified, it\'s the subreddits name.')),
        (('-t', '--test'),
         dict(dest='test', action='store_true', default=False,
              help='Tests the functions of the script')),
        (('-l', '--loop'),
         dict(dest='loop', action='store_true', default=False,
              help="""Continuing download loop. When this option is set every 5 Minutes the program searches for
new images""")),
    )
    cli = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
    for flags, settings in option_specs:
        cli.add_option(*flags, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def download_subreddits(subreddits, count, output):
    """Download each of the given subreddits in turn.

    :param subreddits: iterable of subreddit names.
    :param count: maximum number of images per subreddit.
    :param output: name of the output zip file, or ``None`` for the default.
    """
    for subreddit_name in subreddits:
        print('[~] Downloading %s' % subreddit_name)
        download_subreddit(subreddit_name, count=count, out=output)
        print()
|
||||
|
||||
|
||||
def main():
    """Entry point: parse the command line and run the requested downloads.

    In test mode a single image of a fixed subreddit is downloaded into
    ``test.zip``, which is removed again afterwards. In loop mode the download
    is repeated every five minutes until the program is interrupted. The
    download directory is cleaned up in every case, including after an
    interrupt or an error.
    """
    options, subreddits = parser_init()
    count = options.count
    output = options.output
    if options.test:
        count = 1
        subreddits = ['python']
        output = 'test.zip'
    try:
        if options.loop:
            while True:
                download_subreddits(subreddits, count, output)
                print('[~] Next Download in 5 minues...')
                time.sleep(300)
        else:
            download_subreddits(subreddits, count, output)
    except KeyboardInterrupt:
        # Ctrl-C is the only way to leave loop mode; exit without a traceback.
        print('\n[-] Interrupted by user')
    finally:
        cleanup()
    # The zip file may be missing if the run was interrupted early.
    if options.test and os.path.isfile(output):
        os.remove(output)
    if errors:
        print('[-] Following errors occured:')
        for key, times in errors.items():
            print('    %s times: %s' % (times, key))
|
||||
|
||||
|
||||
# Run the command line entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue