Added full functionality to riddle2.py

- -c option to set the number of images to download
pull/7/head
Julius 6 years ago
parent a513b5348d
commit feaca33300

@@ -1,10 +1,13 @@
 # encoding=utf-8
 import urllib.request as urlreq
 from bs4 import BeautifulSoup
 import zipfile
 import time
 import os
 import sys
+import optparse
+import shutil
 
 blacklist = ['b.thumbs.redditmedia.com', 'reddit.com']
 dl_dir = './.cache/'
@@ -15,6 +18,7 @@ hdr = {  # request header
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8',
     'Connection': 'keep-alive'}
+errors = {}
 
 
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
@@ -46,8 +50,11 @@ def get_soup4url(url):
             html = urlreq.urlopen(req).read()
             break
         except Exception as e:
-            print('[-]', e)
+            if e in errors:
+                errors[e] += 1
+            else:
+                errors[e] = 1
     if html:
         soup = BeautifulSoup(html, "lxml")
         return soup
     return False
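The new except branch tallies failures in the module-level errors dict (dumped by main() at exit) instead of printing each one. One subtlety: exception instances hash by identity, so two occurrences of the same error land under separate keys. A minimal sketch of a variant that aggregates repeats by keying on the message; record_error is a hypothetical helper, not part of this commit:

errors = {}

def record_error(e):
    # Key by the exception's string form so repeated failures with the
    # same message share one counter (instances hash by identity).
    key = str(e)
    errors[key] = errors.get(key, 0) + 1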
@@ -98,7 +105,7 @@ def get_img4site(url):
     sys.stdout.flush()
     for t in soup.find_all(has_source):
         try:
-            if 'redditmedia' not in t['src']:
+            if 'redditmedia' not in t['src'] and 'icon' not in t['src']:
                 img = t['src']
                 if 'http' not in img.split('/')[0] and '//' not in img.split('.')[0]:
                     img = url + img
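The string tests above decide whether src is already an absolute URL before prefixing the page URL. For comparison, urllib.parse.urljoin from the standard library covers the same cases in one call; a small sketch with made-up URLs:

from urllib.parse import urljoin

# Relative paths resolve against the page URL; absolute and
# protocol-relative ('//host/...') links pass through with a scheme.
urljoin('https://old.reddit.com/r/pics/', 'thumbs/a.jpg')
# -> 'https://old.reddit.com/r/pics/thumbs/a.jpg'
urljoin('https://old.reddit.com/r/pics/', '//i.example.com/a.jpg')
# -> 'https://i.example.com/a.jpg'
urljoin('https://old.reddit.com/r/pics/', 'https://i.example.com/a.jpg')
# -> 'https://i.example.com/a.jpg'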
@@ -118,37 +125,52 @@ def get_img4sub(url, length=-1):
     imgs = []
     print('[ ] 1/2 Getting images...')
     if length >= 0:
-        for x in range(length):
+        x = 0
+        while x < length:
             time.sleep(0.1)  # we don't want to flood with requests
-            imgs.extend(get_img4site(url))
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
+            x = len(imgs)
             url = get_next_url(baseurl, url)
             if not url:
                 break
         sys.stdout.write('\b')
+        imgs = imgs[:length]
     else:
         while url:
             time.sleep(0.1)  # we don't want to flood with requests
-            imgs.extend(get_img4site(url))
+            imgurls = get_img4site(url)
+            if not imgurls:
+                break
+            imgs.extend(imgurls)
             url = get_next_url(baseurl, url)
+    print('[+] Found %s images' % len(imgs))
     return imgs
 
 
 def download_images(imgs, zfile):
-    count = 0
+    count = 1
     imgcount = len(imgs)
+    fnames = [zinfo.filename for zinfo in zfile.infolist()]
     print('[ ] Downloading %s images' % imgcount)
     if not os.path.isdir(dl_dir):
         os.mkdir(dl_dir)
+    print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
     for img in imgs:
-        print_progress(count+1, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
+        print_progress(count, imgcount, prefix="2/2 Downloading: ", suffix="Complete")
         imgname = img.split('/')[-1]
         name = dl_dir + imgname
-        if os.path.isfile(name):
+        if os.path.isfile(name) or imgname in fnames:
+            count += 1
             continue
         f = open(name, "wb")
         req = urlreq.Request(img, headers=hdr)
-        image = urlreq.urlopen(req)
+        try:
+            image = urlreq.urlopen(req)
+        except ConnectionError:
+            print('\n [-] Connection Error')
+            return
         f.write(image.read())
         f.close()
         zfile.write(name, imgname, zipfile.ZIP_DEFLATED)
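The reworked get_img4sub loop now pages until at least length images are collected (or a page comes back empty), then trims the overshoot with imgs[:length]. The same pattern in isolation, with a hypothetical fetch_page callable standing in for the get_img4site/get_next_url pair:

def collect_until(quota, fetch_page):
    # fetch_page() returns the next batch of items, or an empty
    # list once the source is exhausted.
    items = []
    while len(items) < quota:
        batch = fetch_page()
        if not batch:
            break
        items.extend(batch)
    return items[:quota]  # drop the overshoot from the last batch

download_images similarly avoids repeat work: it checks both the .cache directory and the names already recorded in the zipfile's infolist(), so appending to an existing archive only fetches new images. One caveat: urlopen failures usually surface as urllib.error.URLError, which the except ConnectionError clause will not catch.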
@@ -158,18 +180,53 @@ def download_images(imgs, zfile):
             pass
         time.sleep(0.1)  # no don't penetrate
         count += 1
+    added = len(zfile.infolist()) - len(fnames)
+    print('[+] Added %s files to the zipfile' % added)
 
 
-def download_subreddit(sub):
+def download_subreddit(sub, count=-1):
     mode = 'w'
     if os.path.isfile(sub + '.zip'):
         mode = 'a'
     url = 'https://old.reddit.com/r/%s/' % sub
-    imgs = get_img4sub(url)
+    imgs = get_img4sub(url, length=count)
     zfile = zipfile.ZipFile('%s.zip' % sub, mode)
     download_images(imgs, zfile)
     zfile.close()
 
 
+def cleanup():
+    print('[ ] Cleanup...')
+    if os.path.isdir(dl_dir):
+        shutil.rmtree(dl_dir)
+
+
+def parser_init():
+    parser = optparse.OptionParser(usage="usage: %prog [options] [subreddits]")
+    parser.add_option('-c', '--count', dest='count',
+                      type='int', default=-1,
+                      help='The number of images to download.')
+    parser.add_option('-t', '--test', dest='test',
+                      action='store_true', default=False,
+                      help='Tests the functions of the script')
+    return parser.parse_args()
+
+
+def main():
+    options, subreddits = parser_init()
+    if options.count:
+        count = options.count
+    else:
+        count = -1
+    if options.test:
+        count = 1
+        subreddits = ['python']
+    for sub in subreddits:
+        print('[ ] Downloading %s' % sub)
+        download_subreddit(sub, count=count)
+    cleanup()
+    print(errors)
+
+
 if __name__ == '__main__':
-    download_subreddit('Animewallpaper')
+    main()
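With the new entry point, an invocation might look like this (hypothetical examples; the subreddit names are placeholders):

python riddle2.py -c 50 wallpapers
python riddle2.py wallpapers earthporn
python riddle2.py --test

-t/--test runs a one-image smoke test against r/python; without -c, the script keeps paginating until the subreddit runs out of images.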
