diff options
Diffstat (limited to 'dl-from-db.py')
-rwxr-xr-x | dl-from-db.py | 45 |
1 files changed, 45 insertions, 0 deletions
#!/usr/bin/env python3
# Source: new file dl-from-db.py (mode 100755, blob 526c3a4), added by this commit.
"""Grab every document the database returns for a date range.

The script asks ``Database.get_documents`` for the documents between
``--from-date`` and ``--to-date`` and hands each document's ``url`` to
``retronews.grab_magazine``. A falsy result from ``grab_magazine`` is logged
as an error and the loop moves on to the next document.
"""

import warnings

# Installed before the remaining imports; presumably meant to silence
# DeprecationWarnings emitted while they load, so keep this ordering.
warnings.filterwarnings("ignore", category=DeprecationWarning)

import logging

from argparse import ArgumentParser
from mdf import Database, retronews

# Created at import time and shared by the download loop below.
database = Database()


def _build_parser():
    """Return the ArgumentParser describing this script's command line."""
    cli = ArgumentParser()
    cli.add_argument('--output', type=str, required=True,
                     help='output directory')
    cli.add_argument('--from-date', type=str)
    cli.add_argument('--to-date', type=str)
    # Pool sizes default to the constants exposed by the retronews module.
    cli.add_argument('--merge-threads', type=int,
                     default=retronews.TILE_MERGING_POOL_SIZE)
    cli.add_argument('--fetch-threads', type=int,
                     default=retronews.PAGE_FETCHING_POOL_SIZE)
    cli.add_argument('--only-fetch', action='store_true',
                     help='only fetch magazine tiles and exit, do not merge anything')
    cli.add_argument('--force-overwrite', action='store_true',
                     help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    cli.add_argument('--force-probe', action='store_true',
                     help="force all pages to use the 'probe' method")
    cli.add_argument('--fetch-probe-pages', nargs='+', type=int,
                     help="force some pages to use the 'probe' method, when count of vertical and horizontal tiles is unknown")
    return cli


def _main():
    """Parse the command line, size the worker pools and grab each document."""
    opts = _build_parser().parse_args()

    retronews.set_tile_merging_pool_size(opts.merge_threads)
    retronews.set_page_fetching_pool_size(opts.fetch_threads)

    date_range = (opts.from_date, opts.to_date)
    for entry in database.get_documents(date_range):
        url = entry['url']
        print(f'grabbing {url}...')
        grabbed = retronews.grab_magazine(
            url,
            output_root=opts.output,
            probe_pages=opts.fetch_probe_pages,
            probe_all=opts.force_probe,
            only_fetch=opts.only_fetch,
            force_overwrite=opts.force_overwrite,
        )
        if grabbed:
            continue
        # A failed document is reported but does not stop the remaining ones.
        logging.error(f'failed to grab {url}')


if __name__ == '__main__':
    _main()