diff options
author | Evgeny Zinoviev <me@ch1p.io> | 2023-02-02 07:50:34 +0300 |
---|---|---|
committer | Evgeny Zinoviev <me@ch1p.io> | 2023-02-02 07:50:34 +0300 |
commit | 209c6404eb274c9f31e6ce847d365ca82d1bdac9 (patch) | |
tree | 907ff5b651fd18c22eb7a9460a740ecff8f36c46 /grab-links.py |
initial
Diffstat (limited to 'grab-links.py')
-rwxr-xr-x | grab-links.py | 56 |
1 files changed, 56 insertions, 0 deletions
#!/usr/bin/env python3
"""Crawl retronews.fr issue pages and store one link per issue.

Contents of ``grab-links.py`` (new file, mode 100755).

Starting from ``--initial-url``, the script repeatedly:

1. parses the issue URL with ``retronews.parse_url``;
2. asks the document API for the issue's page count (``nbPages``);
3. stores ``(issue_date, url, pages)`` through ``Database.add_link``;
4. downloads the issue page and follows its "SUIVANT" (``--forwards``) or
   "PRÉCÉDENT" (``--backwards``) navigation link.

The crawl stops when the URL can no longer be parsed or when the page has no
navigation link in the requested direction.
"""
import re
import requests
import traceback

from retronews import convert_date, parse_url
from argparse import ArgumentParser, ArgumentError
from database import Database


# Host prepended to site-relative navigation links.
SITE_ROOT = 'https://www.retronews.fr'

# Seconds to wait for a response; without a timeout a stalled connection
# would block the crawl forever.
REQUEST_TIMEOUT = 30

# Navigation anchors of an issue page; group 1 captures the target href.
# Compiled once here instead of on every loop iteration.
NEXT_LINK_RE = re.compile(
    r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT',
    re.S,
)
PREV_LINK_RE = re.compile(
    r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>',
    re.S,
)


def _find_next_url(html, forwards):
    """Return the absolute URL of the neighbouring issue linked from *html*.

    :param html: source of an issue page.
    :param forwards: follow the "SUIVANT" link when true, otherwise the
        "PRÉCÉDENT" link.
    :raises LookupError: if the page has no such navigation link.
    """
    pattern = NEXT_LINK_RE if forwards else PREV_LINK_RE
    match = pattern.search(html)
    if match is None:
        raise LookupError('navigation link not found in page')

    next_url = match.group(1)
    if next_url.startswith('/'):
        # Site-relative href: make it absolute.
        next_url = f'{SITE_ROOT}{next_url}'
    return next_url


def main():
    """Parse the command line and crawl issue links in one direction."""
    parser = ArgumentParser()
    parser.add_argument('--initial-url', type=str, required=True)
    parser.add_argument('--forwards', action='store_true')
    parser.add_argument('--backwards', action='store_true')

    args = parser.parse_args()
    if not args.forwards and not args.backwards:
        # ArgumentError needs (argument, message) and cannot be raised with a
        # single string; parser.error() prints usage and exits with status 2.
        parser.error('either --forwards or --backwards is required!')

    # As before, --forwards takes precedence when both flags are given.
    direction = 'next' if args.forwards else 'previous'

    # Created only after the arguments are validated, so --help and usage
    # errors never touch the database.
    database = Database()

    url = args.initial_url
    while True:
        print(f'grabbing {url}...')
        try:
            pub_date, collection_id, doc_id = parse_url(url)
        except AttributeError:
            # NOTE(review): presumably parse_url() fails this way on a URL it
            # does not recognise; treated as the end of the crawl.
            break

        r = requests.get(
            f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}',
            timeout=REQUEST_TIMEOUT,
        )
        data = r.json()
        pages = int(data['nbPages'])

        y, m, d = convert_date(pub_date)
        issue_date = f'{y}-{m}-{d}'

        print(f'adding {issue_date}')
        database.add_link(issue_date, url, pages)

        r = requests.get(url, timeout=REQUEST_TIMEOUT)

        try:
            url = _find_next_url(r.text, forwards=args.forwards)
        except LookupError:
            # Only the "no link" case is handled here, so Ctrl-C and real
            # bugs are no longer swallowed by a bare except.
            traceback.print_exc()
            print(f'error: failed to find {direction} link! exiting')
            break


if __name__ == '__main__':
    main()