From 209c6404eb274c9f31e6ce847d365ca82d1bdac9 Mon Sep 17 00:00:00 2001
From: Evgeny Zinoviev
Date: Thu, 2 Feb 2023 07:50:34 +0300
Subject: initial

---
 grab-links.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100755 grab-links.py

diff --git a/grab-links.py b/grab-links.py
new file mode 100755
index 0000000..286794f
--- /dev/null
+++ b/grab-links.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+import re
+import requests
+import traceback
+
+from retronews import convert_date, parse_url
+from argparse import ArgumentParser
+from database import Database
+
+
+if __name__ == '__main__':
+    database = Database()
+
+    parser = ArgumentParser()
+    parser.add_argument('--initial-url', type=str, required=True)
+    parser.add_argument('--forwards', action='store_true')
+    parser.add_argument('--backwards', action='store_true')
+
+    args = parser.parse_args()
+    if not args.forwards and not args.backwards:
+        parser.error('either --forwards or --backwards is required!')
+
+    url = args.initial_url
+    while True:
+        print(f'grabbing {url}...')
+        try:
+            pub_date, collection_id, doc_id = parse_url(url)
+        except AttributeError:
+            break
+
+        r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')  # issue metadata
+        data = r.json()
+        pages = int(data['nbPages'])
+
+        y, m, d = convert_date(pub_date)
+        issue_date = f'{y}-{m}-{d}'
+
+        print(f'adding {issue_date}')
+        database.add_link(issue_date, url, pages)
+
+        r = requests.get(url)  # fetch the issue page to find the next/previous link
+
+        try:
+            if args.forwards:
+                next_url = re.search(r'href="([^"]+)"[^>]*>(?:\s|<[^>]+>)*SUIVANT', r.text, re.S).groups()[0]
+            elif args.backwards:
+                next_url = re.search(r'href="([^"]+)"[^>]*>(?:\s|<[^>]+>)*PRÉCÉDENT', r.text, re.S).groups()[0]
+
+            if next_url.startswith('/'):
+                next_url = f'https://www.retronews.fr{next_url}'
+
+            url = next_url
+        except AttributeError:
+            traceback.print_exc()
+            print('error: failed to find the next/previous link! exiting')
+            break
-- 
cgit v1.2.3