From 0d76d43c00e9759fefa87e0a8184739997926eee Mon Sep 17 00:00:00 2001
From: trivernis
Date: Sun, 23 Feb 2020 12:25:26 +0100
Subject: [PATCH] Add support for multiple urls and store data in subfolders

---
 Pipfile  |  1 +
 miner.py | 49 ++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/Pipfile b/Pipfile
index 4d7b373..54866f8 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,6 +9,7 @@ verify_ssl = true
 requests = "*"
 fake-useragent = "*"
 stem = "*"
+toml = "*"
 
 [requires]
 python_version = "3.8"
diff --git a/miner.py b/miner.py
index cdf4fb3..82dd7ed 100755
--- a/miner.py
+++ b/miner.py
@@ -5,11 +5,20 @@ from lib.utils import parse_duration
 import time
 import os
 import mimetypes
+import base64
+import hashlib
+import json
+
+
+def get_folder_name(url: str) -> str:
+    m = hashlib.sha256()
+    m.update(url.encode('utf-8'))
+    return base64.urlsafe_b64encode(m.digest()).decode('utf-8')
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description='Periodically mine data')
-    parser.add_argument('url', type=str, help='the data endpoint url')
+    parser.add_argument('url', type=str, help='the data endpoint url', nargs='+')
     parser.add_argument('-t', '--tor', action='store_true', help='If tor should be used for requests')
     parser.add_argument('-o', '--output-dir', required=True, type=str, help='The output directory for the data')
     parser.add_argument('-i', '--interval', default='1h', type=str, help='The interval in which the data is requested')
@@ -18,18 +27,20 @@ def parse_arguments():
     return parser.parse_args()
 
 
-def request_loop(client: Client, url: str, out_dir: str, method: str = 'GET', interval=1800, body=None):
+def request_loop(client: Client, urls: [str], out_dir: str, method: str = 'GET', interval=1800, body=None):
     while True:
         try:
-            req = client.request(url, method=method, data=body)
-            if req.status_code == 200:
-                extension = mimetypes.guess_extension(req.headers['content-type'].split(';')[0])
-                print('[+] Request succeeded: mime: %s, timing: %ss' %
-                      (req.headers['content-type'], req.elapsed.total_seconds()))
-                with open(out_dir + '/%s%s' % (time.strftime('%m-%d-%y_%H-%M-%S'), extension), 'w') as f:
-                    f.write(req.text)
-            else:
-                print('[-] Request failed with code %s: %s' % (req.status_code, req.text))
+            for url in urls:
+                req = client.request(url, method=method, data=body)
+                if req.status_code == 200:
+                    extension = mimetypes.guess_extension(req.headers['content-type'].split(';')[0])
+                    print('[+] Request to %s succeeded: mime: %s, timing: %ss' %
+                          (url, req.headers['content-type'], req.elapsed.total_seconds()))
+                    with open('%s/%s/%s%s' % (out_dir, get_folder_name(url),
+                                              time.strftime('%m-%d-%y_%H-%M-%S'), extension), 'w') as f:
+                        f.write(req.text)
+                else:
+                    print('[-] Request failed with code %s: %s' % (req.status_code, req.text))
             client.reset()
             print('[ ] Pausing for %ss' % interval)
             time.sleep(interval)
@@ -53,6 +64,22 @@ def main():
     client = Client()
     if not os.path.exists(args.output_dir):
         os.mkdir(args.output_dir)
+    mapping = {}
+    mapping_file = '%s/mapping.json' % args.output_dir
+    if os.path.exists(mapping_file):
+        with open(mapping_file, 'r') as mf:
+            try:
+                mapping = json.load(mf)
+            except Exception as e:
+                print(e)
+    for url in args.url:
+        folder_name = get_folder_name(url)
+        folder_path = '%s/%s' % (args.output_dir, folder_name)
+        mapping[url] = folder_name
+        if not os.path.exists(folder_path):
+            os.mkdir(folder_path)
+    with open(mapping_file, 'w') as mf:
+        json.dump(mapping, mf, indent=' ')
     body = None
     if args.payload_file:
         body = open(args.payload_file, 'rb')