diff options
author | Evgeny Zinoviev <me@ch1p.io> | 2023-12-30 15:37:08 +0300 |
---|---|---|
committer | Evgeny Zinoviev <me@ch1p.io> | 2023-12-30 15:37:08 +0300 |
commit | abd1975def213891afdf0d87adbf79c2c7dbc0cb (patch) | |
tree | 4c0feb5ecc7f8dcf5c17e74a81a2bae9ba6c649f /dl-retronews.py | |
parent | 38474234431fcf1dae50c00cdf649b4102c0faf9 (diff) |
Diffstat (limited to 'dl-retronews.py')
-rwxr-xr-x | dl-retronews.py | 80 |
1 files changed, 80 insertions, 0 deletions
#!/usr/bin/env python3
"""Command-line driver around ``mdf.retronews.grab_magazine``.

Grabs the magazine issue at ``--url`` into ``--output`` and, when
``--continue-next`` or ``--continue-prev`` is given, keeps following the
"SUIVANT" / "PRÉCÉDENT" link found on each issue page until grabbing fails
or no such link can be found.
"""

# The filter is installed before the remaining imports, presumably so that
# DeprecationWarnings raised while importing them are silenced as well.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import re
import requests
import logging

from mdf import retronews
from argparse import ArgumentParser


# Anchors of the issue page that point at the following ("SUIVANT") and the
# preceding ("PRÉCÉDENT") issue; group 1 captures the href.
_NEXT_LINK_RE = re.compile(
    r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT',
    re.S)
_PREV_LINK_RE = re.compile(
    r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>',
    re.S)


def _find_adjacent_url(html, forward):
    """Return the URL of the neighbouring issue linked from *html*.

    :param html: text of an issue page
    :param forward: look for the "SUIVANT" link when true, otherwise for
        the "PRÉCÉDENT" link
    :return: absolute URL of the neighbouring issue, or ``None`` when the
        page contains no matching link
    """
    pattern = _NEXT_LINK_RE if forward else _PREV_LINK_RE
    match = pattern.search(html)
    if match is None:
        return None

    link = match.group(1)
    # Site-relative hrefs are resolved against the retronews host.
    if link.startswith('/'):
        link = f'https://www.retronews.fr{link}'
    return link


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--url', type=str, required=True)
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
    parser.add_argument('--continue-prev', action='store_true',
                        help='keep scrapping backwards in time')
    parser.add_argument('--continue-next', action='store_true',
                        help='keep scrapping forwards in time')
    parser.add_argument('--only-fetch', action='store_true',
                        help='only fetch magazine tiles and exit, do not merge anything')
    parser.add_argument('--force-overwrite', action='store_true',
                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
    parser.add_argument('--force-probe', action='store_true',
                        help='force all pages to use the \'probe\' method')
    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')

    args = parser.parse_args()

    # One-shot options make no sense once we start walking across issues.
    with_continuation = args.continue_prev or args.continue_next
    if args.fetch_probe_pages and with_continuation:
        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
    if args.only_fetch and with_continuation:
        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')

    # Write the requested pool sizes back to the module the defaults were
    # read from; assigning to same-named names in this script (as before)
    # left --merge-threads / --fetch-threads without any effect.
    # NOTE(review): assumes mdf.retronews reads these module attributes at
    # call time - confirm against the library.
    retronews.TILE_MERGING_POOL_SIZE = args.merge_threads
    retronews.PAGE_FETCHING_POOL_SIZE = args.fetch_threads

    url = args.url
    while True:
        print(f'grabbing {url}...')
        if not retronews.grab_magazine(url,
                                       output_root=args.output,
                                       probe_pages=args.fetch_probe_pages,
                                       probe_all=args.force_probe,
                                       only_fetch=args.only_fetch,
                                       force_overwrite=args.force_overwrite):
            logging.error('failed to grab')
            break

        if not with_continuation:
            break

        # Re-fetch the issue page to discover the neighbouring issue.
        r = requests.get(url)

        # --continue-next takes precedence when both directions are given.
        next_url = _find_adjacent_url(r.text, forward=args.continue_next)
        if next_url is None:
            direction = 'next' if args.continue_next else 'previous'
            print(f'error: failed to find {direction} link! exiting')
            break

        url = next_url