-rw-r--r--   .gitignore                                          |   5
-rwxr-xr-x   check-ratio.py                                      |   4
-rw-r--r--   database/__init__.py                                |   1
-rw-r--r--   database/database.py (renamed from database.py)     |   2
-rwxr-xr-x   grab-retronews.py                                   | 132
-rw-r--r--   retronews/__init__.py                               |   8
-rw-r--r--   retronews/retronews.py (renamed from retronews.py)  |   4
7 files changed, 115 insertions, 41 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..49938ac
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/.idea
+/venv
+/*.sqlite3
+__pycache__
+/test.py
\ No newline at end of file
diff --git a/check-ratio.py b/check-ratio.py
index bd546bf..6c85283 100755
--- a/check-ratio.py
+++ b/check-ratio.py
@@ -20,5 +20,5 @@ if __name__ == '__main__':
             print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
             continue
         ratio = width/height
-        # TODO
-        # print(f'[]')
\ No newline at end of file
+        if ratio >= 0.8:
+            print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')
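The check-ratio.py change replaces the old TODO stub with an actual report: any page whose width-to-height ratio reaches 0.8 is printed. A minimal standalone sketch of the same check (the doc fields and the surrounding loop are assumptions based on the hunk context):

    # Hypothetical helper mirroring the check this hunk introduces.
    def report_page_ratio(doc: dict, page: int, width: int, height: int,
                          threshold: float = 0.8) -> None:
        if not width or not height:
            print(f'ERROR: {doc["collection_id"]}/{doc["page_id"]}/{page}: width or height is zero')
            return
        ratio = width / height
        if ratio >= threshold:
            print(f'{doc["collection_id"]}/{doc["doc_id"]}/{page}: {ratio}')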
diff --git a/database/__init__.py b/database/__init__.py
new file mode 100644
index 0000000..ef3f969
--- /dev/null
+++ b/database/__init__.py
@@ -0,0 +1 @@
+from .database import Database
diff --git a/database.py b/database/database.py
index b67f4d0..04902f1 100644
--- a/database.py
+++ b/database/database.py
@@ -13,7 +13,7 @@ class Database:
     def __init__(self):
         self.logger = logging.getLogger(self.__class__.__name__)
 
-        file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
         self.sqlite = sqlite3.connect(file, check_same_thread=False)
         self.lock = threading.Lock()
 
diff --git a/grab-retronews.py b/grab-retronews.py
index fbd1b28..ac4dbf1 100755
--- a/grab-retronews.py
+++ b/grab-retronews.py
@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 import os
 import sys
 import json
@@ -13,6 +17,7 @@ import shutil
 import queue
 import traceback
 import retronews
+import logging
 from database import Database
 
 from typing import Optional
@@ -20,6 +25,8 @@ from threading import Thread, Lock
 from time import sleep
 from argparse import ArgumentParser
 
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
 VTILES = 3
 HTILES = 2
 TILE_MERGING_POOL_SIZE = 8
@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
     _url: str
     _save_as: str
     _download_result: Optional[bool]
+    _handle_http: bool
+    user_info: dict
 
-    def __init__(self, url: str, save_as: str, thread_name=None):
+    def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
         super().__init__()
+        if user_info is None:
+            user_info = {}
         if thread_name:
             self.name = thread_name
 
         self._url = url
         self._save_as = save_as
         self._download_result = None
+        self._handle_http = handle_http
+        self.user_info = user_info
 
     def run(self):
-        self._download_result = download_file(self._url, self._save_as)
+        try:
+            self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
+        except urllib.error.HTTPError:
+            pass
 
     def is_downloaded(self) -> bool:
         return self._download_result is True
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
             try:
                 page = merging_queue.get_nowait()
                 page_dir = os.path.join(self._working_dir, str(page))
-                thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
+                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                 meta_path = os.path.join(page_dir, 'meta.json')
 
                 if os.path.exists(thumbnail_path):
@@ -100,12 +116,12 @@ class TileMergeWorker(Thread):
                 for h in range(htiles):
                     vfiles = []
                     for v in range(vtiles):
-                        vfiles.append(f'v{v}_h{h}.jpg')
-                    run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
-                    hfiles.append(f'_v_{h}.jpg')
+                        vfiles.append(f'{h}x{v}.jpg')
+                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
+                    hfiles.append(f'{h}.jpg')
 
                 run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
-                shutil.rmtree(page_dir)
+                # shutil.rmtree(page_dir)
 
                 safe_print(f'[tile merger {self._number}] page {page} done')
 
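In the tile merger above, a page is now assembled from tiles named {h}x{v}.jpg: each column h is stacked top-to-bottom with ImageMagick's `convert -append`, then the column strips are joined left-to-right with `convert +append`. A self-contained sketch of that pipeline (using subprocess.run directly instead of the script's own run() helper; paths and counts are illustrative):

    import subprocess

    def merge_tiles(page_dir: str, htiles: int, vtiles: int, out_file: str) -> None:
        hfiles = []
        for h in range(htiles):
            # stack one column of tiles vertically: 0x0.jpg, 0x1.jpg, ... -> 0.jpg
            vfiles = [f'{h}x{v}.jpg' for v in range(vtiles)]
            subprocess.run(['convert', '-append', *vfiles, f'{h}.jpg'],
                           cwd=page_dir, check=True)
            hfiles.append(f'{h}.jpg')
        # join the column strips horizontally into the final page image
        subprocess.run(['convert', '+append', *hfiles, out_file],
                       cwd=page_dir, check=True)

Note the commit also stops deleting the page directory (shutil.rmtree is commented out), which keeps the intermediate tiles around, presumably for inspection.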
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
     _failed: bool
     _error: Optional[str]
     _probe_pages: Optional[list[int]]
+    _probe_all: bool
 
-    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+    def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
         super().__init__()
         self._working_dir = working_dir
         self._number = number
@@ -129,6 +146,7 @@ class PageFetchWorker(Thread):
         self._failed = False
         self._error = None
         self._probe_pages = probe_pages
+        self._probe_all = probe_all
 
     def run(self):
         safe_print(f'[pf-{self._number}] started')
@@ -140,7 +158,7 @@ class PageFetchWorker(Thread):
                 page = pages_queue.get_nowait()
                 safe_print(f'[pf-{self._number}] page {page} started')
 
-                if page in self._probe_pages:
+                if self._probe_all or page in self._probe_pages:
                     self.probe_dl(page)
                 else:
                     try:
@@ -209,28 +227,52 @@
         real_h = 0
         real_v = 0
         data_error = False
-        for h in range(5):
-            for v in range(5):
+        dl_tasks = []
+        for h in range(10):
+            for v in range(10):
                 url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
-                output_file = f'{page_dir}/v{v}_h{h}.jpg'
+                output_file = f'{page_dir}/{h}x{v}.jpg'
                 if os.path.isfile(output_file):
                     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
                     if os.path.getsize(output_file) < 4:
                         os.unlink(output_file)
                     continue
-                try:
-                    if not download_file(url, output_file, handle_http_errors=False):
-                        raise OSError('network failure')
-                    if not imghdr.what(output_file):
-                        data_error = True
-                        break
-                    real_v = v
-                    real_h = h
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
-
-                except urllib.error.HTTPError:
-                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
-                    break
+
+                dl_tasks.append(DownloaderThread(url=url,
+                                                 save_as=os.path.join(page_dir, output_file),
+                                                 handle_http=True,
+                                                 thread_name=f'p{page}-v{v}-h{h}',
+                                                 user_info=dict(h=h, v=v)))
+
+        for task in dl_tasks:
+            task.start()
+        for task in dl_tasks:
+            task.join()
+
+            if task.is_downloaded():
+                task_h = task.user_info['h']
+                task_v = task.user_info['v']
+                if task_h > real_h:
+                    real_h = task_h
+                if task_v > real_v:
+                    real_v = task_v
+
+            if not imghdr.what(task._save_as):
+                data_error = True
+
+        # try:
+        #     if not download_file(url, output_file, handle_http_errors=False):
+        #         raise OSError('network failure')
+        #     if not imghdr.what(output_file):
+        #         data_error = True
+        #         break
+        #     real_v = v
+        #     real_h = h
+        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
+        #
        # except urllib.error.HTTPError:
        #     safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
        #     break
 
         if data_error:
             self.thumbnail_dl(page)
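The rewritten probe_dl no longer walks tiles one by one and bails at the first HTTP error: it starts a DownloaderThread for every candidate tile in a 10x10 grid, tags each thread with its (h, v) coordinates via user_info, joins them all, and takes the real grid size from the highest indices that actually downloaded. A simplified sketch of that idea with plain threads (fetch() stands in for the script's download_file(); all names here are illustrative):

    import threading

    def probe_grid(fetch, tile_url, grid: int = 10) -> tuple[int, int]:
        results = {}                      # (h, v) -> download succeeded?
        lock = threading.Lock()

        def worker(h: int, v: int) -> None:
            ok = fetch(tile_url(h, v))    # fetch() must swallow HTTP 404s
            with lock:
                results[(h, v)] = ok

        threads = [threading.Thread(target=worker, args=(h, v))
                   for h in range(grid) for v in range(grid)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        hits = [hv for hv, ok in results.items() if ok]
        real_h = max((h for h, _ in hits), default=0)
        real_v = max((v for _, v in hits), default=0)
        return real_h, real_v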
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
 
 def grab_magazine(url: str,
                   output_root: str,
                   probe_pages: Optional[list[int]] = None,
-                  only_fetch=False, force_overwrite=False):
-    pub_date, collection_id, doc_id = retronews.parse_url(url)
+                  probe_all=False, only_fetch=False, force_overwrite=False):
+    try:
+        pub_date, collection_id, doc_id = retronews.parse_url(url)
+    except AttributeError:
+        return False
 
-    data = retronews.api_doc_info(collection_id, doc_id)
+    data = retronews.doc_info(collection_id, doc_id)
     pages = int(data['nbPages'])
     print(f'found {pages} pages')
@@ -283,7 +328,7 @@ def grab_magazine(url: str,
     if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
         if not force_overwrite:
             print(f'{y}-{m}-{d}.pdf already exists, not continuing')
-            return
+            return True
         else:
             os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
             print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
@@ -302,7 +347,8 @@ def grab_magazine(url: str,
                                     number=i+1,
                                     collection_id=collection_id,
                                     doc_id=doc_id,
-                                    probe_pages=probe_pages))
+                                    probe_pages=probe_pages,
+                                    probe_all=probe_all))
 
     for worker in pool:
         worker.start()
@@ -312,10 +358,10 @@ def grab_magazine(url: str,
             with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
                 f.write(f'error: {worker.get_error()}')
             print(f'ERROR: failed to download {pub_date} magazine')
-            return
+            return False
 
     if only_fetch:
-        return
+        return True
 
     # merge tiles
     for page in range(pages):
@@ -338,6 +384,8 @@ def grab_magazine(url: str,
     except:
         traceback.print_exc()
 
+    return True
+
 
 if __name__ == '__main__':
     parser = ArgumentParser()
@@ -354,6 +402,8 @@ if __name__ == '__main__':
                         help='only fetch magazine tiles and exit, do not merge anything')
     parser.add_argument('--force-overwrite', action='store_true',
                         help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
     parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
                         help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
 
@@ -371,11 +421,14 @@ if __name__ == '__main__':
     url = args.url
     while True:
         print(f'grabbing {url}...')
-        grab_magazine(url,
-                      output_root=args.output,
-                      probe_pages=args.fetch_probe_pages,
-                      only_fetch=args.only_fetch,
-                      force_overwrite=args.force_overwrite)
+        if not grab_magazine(url,
+                             output_root=args.output,
+                             probe_pages=args.fetch_probe_pages,
+                             probe_all=args.force_probe,
+                             only_fetch=args.only_fetch,
+                             force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break
 
         if not args.continue_prev and not args.continue_next:
             break
@@ -383,11 +436,16 @@ if __name__ == '__main__':
         r = requests.get(url)
 
         try:
+            next_url = None
             if args.continue_next:
                 next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
             elif args.continue_prev:
                 next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
 
+            if not next_url:
+                if not next_url:
+                    break
+
             if next_url.startswith('/'):
                 next_url = f'https://www.retronews.fr{next_url}'
 
diff --git a/retronews/__init__.py b/retronews/__init__.py
new file mode 100644
index 0000000..ae3b518
--- /dev/null
+++ b/retronews/__init__.py
@@ -0,0 +1,8 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url
+)
\ No newline at end of file
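The new retronews/__init__.py turns the former flat module into a package while re-exporting its public helpers, so existing call sites are unaffected by the move. For example, code like this from grab-retronews.py keeps working unchanged (the IDs below are placeholders):

    import retronews

    # both resolve to retronews/retronews.py via the package re-exports
    data = retronews.doc_info('some-collection', 12345)
    url = retronews.tile_url('some-collection', 12345, 1, h_tile=0, v_tile=0)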
diff --git a/retronews.py b/retronews/retronews.py
index 6eaf120..9e80c58 100644
--- a/retronews.py
+++ b/retronews/retronews.py
@@ -9,6 +9,8 @@ MONTHS = dict(
     may=5,
     jun=6,
     jul=7,
+    juillet=7,
+    aout=8,
     aug=8,
     sep=9,
     oct=10,
@@ -27,7 +29,7 @@ def convert_date(s: str) -> tuple[str, str, str]:
 
 
 def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:mercure-de-france|le-nouveau-mercure|le-mercure-galant|mercure-francais|mercure-galant)/([^/]+)/(\d+)/(\d+)/', url).groups()
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
 
 
 def doc_info(collection_id, doc_id):
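The parse_url change drops the hard-coded list of Mercure collection slugs in favour of a generic segment pattern, so document URLs for any publication now parse. One side effect: re.search returns None for URLs that do not match at all, making .groups() raise AttributeError, which is exactly what the new try/except in grab_magazine catches. A quick illustration (the URL is made up but follows the same shape):

    import re

    def parse_url(url: str) -> tuple:
        return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()

    print(parse_url('https://www.retronews.fr/journal/some-gazette/15-aout-1778/118/2785/1'))
    # ('15-aout-1778', '118', '2785')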