From 61846e81807fde40ddd12476fd284dc2d7472f04 Mon Sep 17 00:00:00 2001 From: trivernis Date: Mon, 24 Feb 2020 15:22:05 +0100 Subject: [PATCH] Add option to compress the data --- README.md | 8 +++++--- lib/io.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ miner.py | 20 +++++++++++++------- 3 files changed, 69 insertions(+), 10 deletions(-) create mode 100644 lib/io.py diff --git a/README.md b/README.md index 35bd88c..c461837 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ pipenv install ## Usage ``` -usage: miner.py [-h] [-t] -o OUTPUT_DIR [-i INTERVAL] [-m METHOD] [-p PAYLOAD_FILE] url +usage: miner.py [-h] [-t] -o OUTPUT_DIR [-i INTERVAL] [-m METHOD] [-b BODY] [-p TOR_PASSWORD] [-z] url [url ...] Periodically mine data @@ -32,8 +32,10 @@ optional arguments: The interval in which the data is requested -m METHOD, --method METHOD The HTTP method that is used - -p PAYLOAD_FILE, --payload-file PAYLOAD_FILE - The file containing the requests payload. + -b BODY, --body BODY The file containing the requests payload/body. + -p TOR_PASSWORD, --tor-password TOR_PASSWORD + The password used for the tor control port. 
+ -z, --compress If the data should be compressed ``` ## License diff --git a/lib/io.py b/lib/io.py new file mode 100644 index 0000000..10d3dd7 --- /dev/null +++ b/lib/io.py @@ -0,0 +1,51 @@ +import os +from os import path +import zipfile +import tempfile + + +class FileManager: + + def __init__(self, data_dir, directories: [str], compress=False, tmp='.tmp'): + self._dirs = directories + self.compress = compress + self._data_dir = data_dir + self._zips = {} + self._tmpdir = tempfile.gettempdir() + if not compress: + self._open_directories() + + def _open_directories(self): + for d in self._dirs: + if not path.exists(path.join(self._data_dir, d)): + os.mkdir(path.join(self._data_dir, d)) + + def get_file(self, directory, name): + """ + Opens a new file with the given name in the directory + :param directory: + :param name: + :return: + """ + if self.compress: + return open(path.join(self._tmpdir, name), 'w') + else: + return open(path.join(self._data_dir, directory, name), 'w') + + def store_file(self, directory, name): + """ + Adds the file to the directory's zip archive if compression is active. 
+ :param directory: + :param name: + :return: + """ + if self.compress: + mode = 'w' + z_name = path.join(self._data_dir, directory + '.zip') + if path.exists(z_name): + mode = 'a' + with zipfile.ZipFile(z_name, mode, compression=zipfile.ZIP_LZMA) as zf: + f_path = path.join(self._tmpdir, name) + zf.write(f_path, name) + zf.close() + os.remove(f_path) diff --git a/miner.py b/miner.py index 79e4359..b4e2b86 100755 --- a/miner.py +++ b/miner.py @@ -2,8 +2,10 @@ import argparse from lib.client import TorClient, Client from lib.utils import parse_duration +from lib.io import FileManager import time import os +from os import path import mimetypes import base64 import hashlib @@ -25,10 +27,11 @@ def parse_arguments(): parser.add_argument('-m', '--method', default='GET', type=str, help='The HTTP method that is used') parser.add_argument('-b', '--body', type=str, help='The file containing the requests payload/body.') parser.add_argument('-p', '--tor-password', type=str, help='The password used for the tor control port.') + parser.add_argument('-z', '--compress', action='store_true', help='If the data should be compressed') return parser.parse_args() -def request_loop(client: Client, urls: [str], out_dir: str, method: str = 'GET', interval=1800, body=None): +def request_loop(client: Client, urls: [str], fm: FileManager, method: str = 'GET', interval=1800, body=None): while True: try: for url in urls: @@ -37,9 +40,12 @@ def request_loop(client: Client, urls: [str], out_dir: str, method: str = 'GET', extension = mimetypes.guess_extension(req.headers['content-type'].split(';')[0]) print('[+] Request to %s succeeded: mime: %s, timing: %ss' % (url, req.headers['content-type'], req.elapsed.total_seconds())) - with open('%s/%s/%s%s' % (out_dir, get_folder_name(url), - time.strftime('%m-%d-%y_%H-%M-%S'), extension), 'w') as f: + d = get_folder_name(url) + f_name = time.strftime('%m-%d-%y_%H-%M-%S') + extension + with fm.get_file(d, f_name) as f: f.write(req.text) + fm.store_file(d, 
f_name) + print('[+] Successfully stored response data as %s ' % f_name) else: print('[-] Request failed with code %s: %s' % (req.status_code, req.text)) client.reset() @@ -73,19 +79,19 @@ def main(): mapping = json.load(mf) except Exception as e: print(e) + dirs = [] for url in args.url: folder_name = get_folder_name(url) - folder_path = '%s/%s' % (args.output_dir, folder_name) mapping[url] = folder_name - if not os.path.exists(folder_path): - os.mkdir(folder_path) + dirs.append(folder_name) with open(mapping_file, 'w') as mf: json.dump(mapping, mf, indent=' ') body = None if args.body: body = open(args.body, 'rb') + fm = FileManager(args.output_dir, dirs, compress=args.compress) print('[ ] Starting request loop...') - request_loop(client, args.url, args.output_dir, args.method, int(interval.total_seconds()), body=body) + request_loop(client, args.url, fm, args.method, int(interval.total_seconds()), body=body) if __name__ == '__main__':