#!/usr/bin/env python3
"""Crawl a chain of Retronews issue pages and record each one.

Starting from ``--initial-url``, the script looks up the issue's page count
through the document API, stores ``(issue date, url, page count)`` with
``Database.add_link`` and then follows the "SUIVANT" (``--forwards``) or
"PRÉCÉDENT" (``--backwards``) link found in the issue page, until a URL can
no longer be parsed or no further link is found.
"""
import re
import traceback
from argparse import ArgumentParser, ArgumentError

import requests

from database import Database
from retronews import convert_date, parse_url

# Seconds to wait on each HTTP request before giving up, so a stalled
# connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 30

# NOTE(review): neither pattern contains a capture group, yet the lookup below
# reads ``.groups()[0]``, which raises IndexError on every match. The markup
# around the link label (presumably an ``href="(...)"`` group) looks like it
# was lost; restore it from the original pattern before relying on this crawl.
_NEXT_LINK_PATTERN = r'SUIVANT'
_PREVIOUS_LINK_PATTERN = r'\s+\s+PRÉCÉDENT'


def _find_adjacent_url(html, forwards):
    """Return the absolute URL of the neighbouring issue linked from *html*.

    Args:
        html: Text of the issue page.
        forwards: Search for the "SUIVANT" link when true, otherwise for the
            "PRÉCÉDENT" link.

    Raises:
        AttributeError: The pattern did not match (``re.search`` gave None).
        IndexError: The pattern matched but has no capture group.
    """
    pattern = _NEXT_LINK_PATTERN if forwards else _PREVIOUS_LINK_PATTERN
    link = re.search(pattern, html, re.S).groups()[0]
    # Site-relative links are made absolute against the public site root.
    if link.startswith('/'):
        link = f'https://www.retronews.fr{link}'
    return link


def main():
    """Parse the command line and crawl issues until the chain ends."""
    database = Database()

    parser = ArgumentParser()
    parser.add_argument('--initial-url', type=str, required=True)
    parser.add_argument('--forwards', action='store_true')
    parser.add_argument('--backwards', action='store_true')
    args = parser.parse_args()

    if not (args.forwards or args.backwards):
        # parser.error prints the usage text and exits with status 2.
        # ArgumentError cannot be raised with just a message: its constructor
        # requires (argument, message).
        parser.error('either --forwards or --backwards is required!')

    # When both flags are given, --forwards takes precedence.
    direction = 'next' if args.forwards else 'previous'

    url = args.initial_url
    while True:
        print(f'grabbing {url}...')
        try:
            pub_date, collection_id, doc_id = parse_url(url)
        except AttributeError:
            # Presumably parse_url raises this when the URL is not an issue
            # URL (failed regex match) -- TODO confirm against retronews.py.
            break

        response = requests.get(
            f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}',
            timeout=REQUEST_TIMEOUT,
        )
        data = response.json()
        pages = int(data['nbPages'])

        y, m, d = convert_date(pub_date)
        issue_date = f'{y}-{m}-{d}'
        print(f'adding {issue_date}')
        database.add_link(issue_date, url, pages)

        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        try:
            url = _find_adjacent_url(page.text, args.forwards)
        except (AttributeError, IndexError):
            # Only the lookup failures end the crawl; Ctrl-C and unrelated
            # errors are no longer swallowed by a bare except.
            traceback.print_exc()
            print(f'error: failed to find {direction} link! exiting')
            break


if __name__ == '__main__':
    main()