@@ -8,11 +8,11 @@ import optparse
 import asyncio
 import shutil
 
-redditurl: str = 'https://old.reddit.com/r/%s'
-dl_dir: str = './.cache/' # Format must be ./
-img_ext: List[str] = ['jpg', 'png', 'bmp']
-blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']
-hdr: Dict[str, str] = {
+redditurl: str = 'https://old.reddit.com/r/%s'  # the url for reddit, with %s to insert the subreddit name
+dl_dir: str = './.cache/'  # the directory where files are cached (format must be ./); created if it doesn't exist
+img_ext: List[str] = ['jpg', 'png', 'bmp']  # file extensions that count as images
+blacklist: List[str] = ['b.thumbs.redditmedia.com', 'reddit.com']  # hosts images shouldn't be downloaded from
+hdr: Dict[str, str] = {  # request header
     'User-Agent': """Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko)
 Chrome/23.0.1271.64 Safari/537.11""",
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
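For orientation, here is a minimal sketch of how these constants appear to fit together, assuming `urlreq` aliases `urllib.request` (suggested by the `urlreq.Request(url, headers=hdr)` call later in the diff); the subreddit `'pics'` is only an example:

```python
import urllib.request as urlreq

redditurl = 'https://old.reddit.com/r/%s'
hdr = {'User-Agent': 'Mozilla/5.0 ...'}   # trimmed; the full header is in the diff

url = redditurl % 'pics'                  # -> 'https://old.reddit.com/r/pics'
req = urlreq.Request(url, headers=hdr)    # the browser-like header avoids Python's default user agent
with urlreq.urlopen(req) as resp:
    html = resp.read()
```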
@@ -20,6 +20,7 @@ hdr: Dict[str, str] = {
     'Connection': 'keep-alive'}
 
 
+# prints a progress bar
 def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
     percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
     filled_length = int(length * iteration // total)
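The context lines only show the percent and fill computation; a plausible completion of the function (the standard terminal progress-bar recipe, not necessarily the author's exact body) would be:

```python
def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filled_length = int(length * iteration // total)
    bar = fill * filled_length + '-' * (length - filled_length)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='')
    if iteration == total:
        print()  # final newline once the bar is full
```

Called as `print_progress(i + 1, len(images))` inside a download loop, the `\r` redraws the bar on a single line.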
@@ -30,6 +31,7 @@ def print_progress(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
     print()
 
 
+# returns a soup for the given url
 async def request_soup(url):
     req = urlreq.Request(url, headers=hdr)
     html = None
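Between `html = None` and `return soup` the hunk elides the fetch itself. A sketch of what such a coroutine typically looks like, assuming BeautifulSoup and an executor wrapper around the blocking urllib call (the real body may differ, e.g. with retries):

```python
import asyncio
import urllib.request as urlreq
from bs4 import BeautifulSoup

hdr = {'User-Agent': 'Mozilla/5.0 ...'}  # the module-level request header

async def request_soup(url):
    req = urlreq.Request(url, headers=hdr)
    html = None
    loop = asyncio.get_event_loop()
    # urllib is blocking, so run it in the default thread-pool executor
    # to keep the event loop free for the other download tasks
    html = await loop.run_in_executor(None, lambda: urlreq.urlopen(req).read())
    soup = BeautifulSoup(html, 'html.parser')
    return soup
```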
@@ -44,6 +46,7 @@ async def request_soup(url):
     return soup
 
 
+# returns all images for the given url
 async def get_img_as(url):
     soup = await request_soup(url)
     ret = []
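The elided middle of `get_img_as` presumably walks the soup for image tags. A hedged sketch, building on `request_soup` above and the `has_source` predicate defined later in the file (the exact filtering is an assumption):

```python
async def get_img_as(url):
    soup = await request_soup(url)
    ret = []
    # find_all accepts a predicate: collect every src that has_source()
    # accepts, skipping anything served from a blacklisted host
    for tag in soup.find_all(has_source):
        src = tag['src']
        if not any(host in src for host in blacklist):
            ret.append(src)
    return ret
```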
@@ -56,6 +59,7 @@ async def get_img_as(url):
     return ret
 
 
+# returns the last post id in the given reddit page
 async def get_next(url):
     ids = []
     soup = await request_soup(url)
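On old.reddit.com each post row is a `div.thing` carrying its id in a `data-fullname` attribute (e.g. `t3_abc123`); assuming the function scrapes those, a sketch:

```python
async def get_next(url):
    ids = []
    soup = await request_soup(url)
    # assumption: post ids live in the data-fullname attribute of div.thing
    for thing in soup.find_all('div', class_='thing'):
        ids.append(thing.get('data-fullname'))
    return [_id for _id in ids if _id][-1]  # last non-empty id on the page
```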
@@ -69,6 +73,7 @@ async def get_next(url):
     return [_id for _id in ids if _id][-1]
 
 
+# returns whether the given tag has a src attribute that points to an image
 def has_source(tag):
     if tag.has_attr('src'):
         try:
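Given the comment, the `img_ext` whitelist, and the visible `try:`, the predicate is presumably an extension check guarded against malformed `src` values; a sketch:

```python
img_ext = ['jpg', 'png', 'bmp']

def has_source(tag):
    if tag.has_attr('src'):
        try:
            # compare the extension after the last dot against the whitelist
            return tag['src'].rsplit('.', 1)[1].lower() in img_ext
        except IndexError:  # src has no dot at all
            return False
    return False
```

This is the shape BeautifulSoup expects for a tag filter, which is why `soup.find_all(has_source)` works in the `get_img_as` sketch above.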
@@ -85,6 +90,7 @@ def has_source(tag):
     return False
 
 
+# downloads all images for the given url and puts them in a zipfile
 async def download_async(url, zfile=None, test=False):
     images = await get_img_as(url)
     print('[+] Found %s images' % len(images))
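The body between the two prints is elided. A sketch of a download-and-zip loop consistent with the visible names (`savedcount` and `logmsg` appear in the final print; their exact meaning is an assumption, as is returning the count so the pagination loop can stop):

```python
import os
import urllib.request as urlreq

async def download_async(url, zfile=None, test=False):
    images = await get_img_as(url)
    print('[+] Found %s images' % len(images))
    savedcount = 0
    for i, src in enumerate(images):
        name = src.split('/')[-1]
        path = os.path.join(dl_dir, name)  # dl_dir: the module-level cache dir
        if not test:
            urlreq.urlretrieve(src, path)  # cache on disk...
            if zfile is not None:
                zfile.write(path, name)    # ...then add to the archive
        savedcount += 1
        print_progress(i + 1, len(images))
    logmsg = 'test' if test else 'download'
    print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))
    return savedcount
```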
@@ -127,6 +133,7 @@ async def download_async(url, zfile=None, test=False):
     print('[+] %s images downloaded | %s finished %s' % (savedcount, logmsg, url))
 
 
+# loops over reddit pages until no more images are found
 async def dl_loop(section, zfile, loop, chaos=False, test=False):
     baseurl = redditurl % section
     url = baseurl
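Old reddit paginates with a `?count=N&after=<id>` query string, which is presumably what `get_next`'s return value feeds; a sketch of the loop (the stop condition and the 25-posts-per-page step are assumptions):

```python
async def dl_loop(section, zfile, loop, chaos=False, test=False):
    baseurl = redditurl % section
    url = baseurl
    count = 0
    while True:
        saved = await download_async(url, zfile, test=test)
        if not saved:              # no more images -> stop paginating
            break
        count += 25                # assumption: 25 posts per page
        url = '%s?count=%d&after=%s' % (baseurl, count, await get_next(url))
        await asyncio.sleep(0.1)   # brief pause between page requests
```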
@@ -151,6 +158,7 @@ async def dl_loop(section, zfile, loop, chaos=False, test=False):
         await asyncio.sleep(0.1)
 
 
+# the main function
 def main(sections, opts):
     chaos = opts.chaos
     if not os.path.exists(dl_dir):
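Finally, a sketch of how `main` might wire everything up, assuming the optparse options carry `chaos` and `test` flags (`chaos` is read in the hunk; `test` is inferred from the coroutine signatures) and that one zip archive is written per subreddit:

```python
import asyncio
import os
from zipfile import ZipFile

def main(sections, opts):
    chaos = opts.chaos
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)  # create the cache directory on first run
    loop = asyncio.get_event_loop()
    for section in sections:
        with ZipFile('%s.zip' % section, 'w') as zfile:
            loop.run_until_complete(
                dl_loop(section, zfile, loop, chaos=chaos, test=opts.test))
```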