Diffstat (limited to 'grab-retronews.py')
-rwxr-xr-x | grab-retronews.py | 399 |
1 files changed, 399 insertions, 0 deletions
diff --git a/grab-retronews.py b/grab-retronews.py
new file mode 100755
index 0000000..fbd1b28
--- /dev/null
+++ b/grab-retronews.py
@@ -0,0 +1,399 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import re
+import imghdr
+import requests
+import urllib.request
+import urllib.error
+import http.client
+import subprocess
+import shutil
+import queue
+import traceback
+import retronews
+
+from database import Database
+from typing import Optional
+from threading import Thread, Lock
+from time import sleep
+from argparse import ArgumentParser
+
+VTILES = 3
+HTILES = 2
+TILE_MERGING_POOL_SIZE = 8
+PAGE_FETCHING_POOL_SIZE = 8
+
+database = Database()
+print_lock = Lock()
+
+pages_queue = queue.Queue()
+merging_queue = queue.Queue()
+
+
+def safe_print(*args, **kwargs):
+    with print_lock:
+        print(*args, **kwargs)
+
+
+def run(args: list, **kwargs):
+    p = subprocess.run(args, **kwargs)
+    if p.returncode != 0:
+        raise OSError(f'convert returned {p.returncode} (' + ' '.join(args) + ')')
+
+
+class DownloaderThread(Thread):
+    _url: str
+    _save_as: str
+    _download_result: Optional[bool]
+
+    def __init__(self, url: str, save_as: str, thread_name=None):
+        super().__init__()
+        if thread_name:
+            self.name = thread_name
+
+        self._url = url
+        self._save_as = save_as
+        self._download_result = None
+
+    def run(self):
+        self._download_result = download_file(self._url, self._save_as)
+
+    def is_downloaded(self) -> bool:
+        return self._download_result is True
+
+
+class TileMergeWorker(Thread):
+    _working_dir: str
+    _number: int
+
+    def __init__(self, working_dir: str, number: int):
+        super().__init__()
+        self._working_dir = working_dir
+        self._number = number
+
+    def run(self):
+        safe_print(f'[tile merger {self._number}] started')
+
+        while not merging_queue.empty():
+            try:
+                page = merging_queue.get_nowait()
+                page_dir = os.path.join(self._working_dir, str(page))
+                # thumbnail_dl() stores the fallback thumbnail inside the page directory
+                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
+                meta_path = os.path.join(page_dir, 'meta.json')
+
+                # a page that only has a thumbnail is used as-is, there are no tiles to merge
+                if os.path.exists(thumbnail_path):
+                    shutil.copy(thumbnail_path, os.path.join(self._working_dir, f'{page}.jpg'))
+                    continue
+
+                if os.path.exists(meta_path):
+                    with open(meta_path, 'r') as f:
+                        meta = json.loads(f.read())
+                        htiles = meta['h']
+                        vtiles = meta['v']
+                else:
+                    htiles = HTILES
+                    vtiles = VTILES
+
+                # merge tiles column by column, then glue the columns together
+                hfiles = []
+                for h in range(htiles):
+                    vfiles = []
+                    for v in range(vtiles):
+                        vfiles.append(f'v{v}_h{h}.jpg')
+                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
+                    hfiles.append(f'_v_{h}.jpg')
+
+                run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
+                shutil.rmtree(page_dir)
+
+                safe_print(f'[tile merger {self._number}] page {page} done')
+
+            except queue.Empty:
+                break
+
+
+class PageFetchWorker(Thread):
+    _working_dir: str
+    _number: int
+    _failed: bool
+    _error: Optional[str]
+    _probe_pages: Optional[list[int]]
+
+    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+        super().__init__()
+        self._working_dir = working_dir
+        self._number = number
+        self._collection_id = collection_id
+        self._doc_id = doc_id
+        self._failed = False
+        self._error = None
+        self._probe_pages = probe_pages
+
+    def run(self):
+        safe_print(f'[pf-{self._number}] started')
+        page = 0
+
+        try:
+            while not pages_queue.empty():
+                try:
+                    page = pages_queue.get_nowait()
+                    safe_print(f'[pf-{self._number}] page {page} started')
+
+                    # _probe_pages is None unless --fetch-probe-pages was given
+                    if self._probe_pages and page in self._probe_pages:
+                        self.probe_dl(page)
+                    else:
+                        try:
+                            self.normal_dl(page)
+                        except OSError:
+                            safe_print(f'[pf-{self._number}] normal_dl() failed, trying probe_dl()')
+                            self.probe_dl(page)
+
+                except queue.Empty:
+                    break
+
+        except Exception as e:
+            self._failed = True
+            self._error = f'while fetching page {page}: {str(e)}'
+
+    def _get_page_dir(self, page):
+        page_dir = os.path.join(self._working_dir, str(page))
+        if not os.path.exists(page_dir):
+            os.makedirs(page_dir)
+        return page_dir
+
+    def is_failed(self) -> bool:
+        return self._failed
+
+    def get_error(self) -> str:
+        return self._error if self._error is not None else ''
+
+    def normal_dl(self, page):
+        page_dir = self._get_page_dir(page)
+        dl_tasks = []
+        for horiz_tile in range(HTILES):
+            for vert_tile in range(VTILES):
+                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+                output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
+                if os.path.isfile(output_file):
+                    if os.path.getsize(output_file) < 4:
+                        # truncated leftover from an interrupted run: delete it and re-download
+                        os.unlink(output_file)
+                    else:
+                        # already downloaded
+                        continue
+
+                dl_tasks.append(DownloaderThread(url=url,
+                                                 save_as=output_file,
+                                                 thread_name=f'p{page}-v{vert_tile}-h{horiz_tile}'))
+
+        for task in dl_tasks:
+            task.start()
+
+        data_error = False
+
+        for task in dl_tasks:
+            task.join()
+            if not task.is_downloaded():
+                raise OSError(f'network error, failed to download {task._url}')
+            elif not imghdr.what(task._save_as):
+                data_error = True
+
+        if data_error:
+            self.thumbnail_dl(page)
+        else:
+            safe_print(f'[pf-{self._number}] page {page}: all files saved')
+
+    def probe_dl(self, page):
+        # tile layout is unknown here: probe up to a 5x5 grid and record the
+        # real dimensions in meta.json for the merging stage
+        page_dir = self._get_page_dir(page)
+        real_h = 0
+        real_v = 0
+        data_error = False
+        for h in range(5):
+            for v in range(5):
+                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+                output_file = f'{page_dir}/v{v}_h{h}.jpg'
+                if os.path.isfile(output_file):
+                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
+                    if os.path.getsize(output_file) < 4:
+                        os.unlink(output_file)
+                    else:
+                        continue
+                try:
+                    if not download_file(url, output_file, handle_http_errors=False):
+                        raise OSError('network failure')
+                    if not imghdr.what(output_file):
+                        data_error = True
+                        break
+                    real_v = v
+                    real_h = h
+                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
+
+                except urllib.error.HTTPError:
+                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
+                    break
+
+        if data_error:
+            self.thumbnail_dl(page)
+        else:
+            with open(os.path.join(page_dir, 'meta.json'), 'w') as f:
+                f.write(json.dumps(dict(v=real_v+1, h=real_h+1)))
+            safe_print(f'[pf-{self._number}] page {page}: all files saved (seemingly...)')
+
+    def thumbnail_dl(self, page):
+        page_dir = self._get_page_dir(page)
+        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
+        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
+            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
+
+
+def download_file(url, output, handle_http_errors=True) -> bool:
+    tries_left = 3
+    ok = False
+    while tries_left > 0:
+        try:
+            urllib.request.urlretrieve(url, output)
+            ok = True
+            break
+        except http.client.RemoteDisconnected:
+            ok = False
+            print(' caught an exception, sleeping for 2 seconds and retrying...')
+            sleep(2)
+            tries_left -= 1
+        except urllib.error.HTTPError as e:
+            if not handle_http_errors:
+                raise e
+            else:
+                print(f' failed to download {url}: {str(e)}')
+                return False
+    return ok
+
+
+def grab_magazine(url: str,
+                  output_root: str,
+                  probe_pages: Optional[list[int]] = None,
+                  only_fetch=False, force_overwrite=False):
+    pub_date, collection_id, doc_id = retronews.parse_url(url)
+
+    data = retronews.api_doc_info(collection_id, doc_id)
+    pages = int(data['nbPages'])
+    print(f'found {pages} pages')
+
+    y, m, d = retronews.convert_date(pub_date)
+    if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
+        if not force_overwrite:
+            print(f'{y}-{m}-{d}.pdf already exists, not continuing')
+            return
+        else:
+            os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
+            print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
+
+    output_dir = os.path.join(output_root, pub_date)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # fetch pages
+    for page in range(pages):
+        pages_queue.put(page + 1)
+
+    pool = []
+    for i in range(PAGE_FETCHING_POOL_SIZE):
+        pool.append(PageFetchWorker(working_dir=output_dir,
+                                    number=i + 1,
+                                    collection_id=collection_id,
+                                    doc_id=doc_id,
+                                    probe_pages=probe_pages))
+    for worker in pool:
+        worker.start()
+
+    for worker in pool:
+        worker.join()
+        if worker.is_failed():
+            with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
+                f.write(f'error: {worker.get_error()}')
+            print(f'ERROR: failed to download {pub_date} magazine')
+            return
+
+    if only_fetch:
+        return
+
+    # merge tiles
+    for page in range(pages):
+        merging_queue.put(page + 1)
+
+    pool = []
+    for i in range(TILE_MERGING_POOL_SIZE):
+        pool.append(TileMergeWorker(working_dir=output_dir, number=i + 1))
+    for worker in pool:
+        worker.start()
+    try:
+        for worker in pool:
+            worker.join()
+
+        # merge page images into a pdf
+        files = [f'{page + 1}.jpg' for page in range(pages)]
+        run(['convert', *files, os.path.join(output_root, f'{y}-{m}-{d}.pdf')], cwd=output_dir)
+        shutil.rmtree(output_dir)
+    except Exception:
+        traceback.print_exc()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--url', type=str, required=True)
+    parser.add_argument('--output', type=str, required=True,
+                        help='output directory')
+    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
+    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--continue-prev', action='store_true',
+                        help='keep scraping backwards in time')
+    parser.add_argument('--continue-next', action='store_true',
+                        help='keep scraping forwards in time')
+    parser.add_argument('--only-fetch', action='store_true',
+                        help='only fetch magazine tiles and exit, do not merge anything')
+    parser.add_argument('--force-overwrite', action='store_true',
+                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, for when the number of vertical and horizontal tiles is unknown')
+
+    args = parser.parse_args()
+
+    with_continuation = args.continue_prev or args.continue_next
+    if args.fetch_probe_pages and with_continuation:
+        raise RuntimeError('--fetch-probe-pages cannot be used together with the --continue-* options, it\'s a one-time hack')
+    if args.only_fetch and with_continuation:
+        raise RuntimeError('--only-fetch cannot be used together with the --continue-* options, it\'s a one-time hack')
+
+    TILE_MERGING_POOL_SIZE = args.merge_threads
+    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
+
+    url = args.url
+    while True:
+        print(f'grabbing {url}...')
+        grab_magazine(url,
+                      output_root=args.output,
+                      probe_pages=args.fetch_probe_pages,
+                      only_fetch=args.only_fetch,
+                      force_overwrite=args.force_overwrite)
+
+        if not args.continue_prev and not args.continue_next:
+            break
+
+        # follow the SUIVANT / PRÉCÉDENT links on the issue page to find the next URL
+        r = requests.get(url)
+
+        try:
+            if args.continue_next:
+                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+            elif args.continue_prev:
+                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
+            if next_url.startswith('/'):
+                next_url = f'https://www.retronews.fr{next_url}'
+
+            url = next_url
+
+        except Exception:
+            print('error: failed to find the previous/next link! exiting')
+            break
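The script imports a local retronews helper module that is not part of this commit. The stub below is a minimal sketch of the interface it is assumed to expose, inferred only from the calls made in grab-retronews.py; the function names and signatures follow that usage, and the bodies are placeholders rather than the real retronews.fr endpoints.

# retronews.py interface sketch, inferred from grab-retronews.py above
from typing import Tuple


def parse_url(url: str) -> Tuple[str, str, str]:
    """Return (pub_date, collection_id, doc_id) parsed from an issue URL."""
    raise NotImplementedError


def api_doc_info(collection_id: str, doc_id: str) -> dict:
    """Return issue metadata; the grabber reads data['nbPages'] from it."""
    raise NotImplementedError


def convert_date(pub_date: str) -> Tuple[str, str, str]:
    """Split a publication date into (year, month, day) strings."""
    raise NotImplementedError


def tile_url(collection_id: str, doc_id: str, page: int, h_tile: int, v_tile: int) -> str:
    """Return the URL of a single JPEG tile of a page."""
    raise NotImplementedError


def thumbnail_url(collection_id: str, doc_id: str, page: int) -> str:
    """Return the URL of the low-resolution page thumbnail used as a fallback."""
    raise NotImplementedError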