author | Evgeny Zinoviev <me@ch1p.io> | 2023-12-30 15:37:08 +0300
---|---|---
committer | Evgeny Zinoviev <me@ch1p.io> | 2023-12-30 15:37:08 +0300
commit | abd1975def213891afdf0d87adbf79c2c7dbc0cb (patch)
tree | 4c0feb5ecc7f8dcf5c17e74a81a2bae9ba6c649f
parent | 38474234431fcf1dae50c00cdf649b4102c0faf9 (diff)
-rwxr-xr-x | dl-from-db.py | 45
-rwxr-xr-x | dl-retronews.py | 80
-rw-r--r-- | mdf/__init__.py | 3
-rw-r--r-- | mdf/database/__init__.py (renamed from database/__init__.py) | 0
-rw-r--r-- | mdf/database/database.py (renamed from database/database.py) | 6
-rw-r--r-- | mdf/retronews/__init__.py | 15
-rw-r--r-- [-rwxr-xr-x] | mdf/retronews/retronews.py (renamed from grab-retronews.py) | 214
-rw-r--r-- | mdf/util/__init__.py | 0
-rw-r--r-- | mdf/util/util.py | 44
-rw-r--r-- | retronews/__init__.py | 8
-rw-r--r-- | retronews/retronews.py | 50
11 files changed, 268 insertions, 197 deletions
diff --git a/dl-from-db.py b/dl-from-db.py
new file mode 100755
index 0000000..526c3a4
--- /dev/null
+++ b/dl-from-db.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import logging
+
+from mdf import Database, retronews
+from argparse import ArgumentParser
+
+database = Database()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--output', type=str, required=True,
+                        help='output directory')
+    parser.add_argument('--from-date', type=str)
+    parser.add_argument('--to-date', type=str)
+    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--only-fetch', action='store_true',
+                        help='only fetch magazine tiles and exit, do not merge anything')
+    parser.add_argument('--force-overwrite', action='store_true',
+                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
+
+    args = parser.parse_args()
+
+    retronews.set_tile_merging_pool_size(args.merge_threads)
+    retronews.set_page_fetching_pool_size(args.fetch_threads)
+
+    for doc in database.get_documents((args.from_date, args.to_date)):
+        url = doc['url']
+        print(f'grabbing {url}...')
+        if not retronews.grab_magazine(url,
+                                       output_root=args.output,
+                                       probe_pages=args.fetch_probe_pages,
+                                       probe_all=args.force_probe,
+                                       only_fetch=args.only_fetch,
+                                       force_overwrite=args.force_overwrite):
+            logging.error(f'failed to grab {url}')
diff --git a/dl-retronews.py b/dl-retronews.py
new file mode 100755
index 0000000..6ee4325
--- /dev/null
+++ b/dl-retronews.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import re
+import requests
+import logging
+
+from mdf import retronews
+from argparse import ArgumentParser
+
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--url', type=str, required=True)
+    parser.add_argument('--output', type=str, required=True,
+                        help='output directory')
+    parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+    parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--continue-prev', action='store_true',
+                        help='keep scraping backwards in time')
+    parser.add_argument('--continue-next', action='store_true',
+                        help='keep scraping forwards in time')
+    parser.add_argument('--only-fetch', action='store_true',
+                        help='only fetch magazine tiles and exit, do not merge anything')
+    parser.add_argument('--force-overwrite', action='store_true',
+                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+    parser.add_argument('--force-probe', action='store_true',
+                        help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
+
+    args = parser.parse_args()
+
+    with_continuation = args.continue_prev or args.continue_next
+    if args.fetch_probe_pages and with_continuation:
+        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one-time hack')
+    if args.only_fetch and with_continuation:
+        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one-time hack')
+
+    TILE_MERGING_POOL_SIZE = args.merge_threads
+    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
+
+    url = args.url
+    while True:
+        print(f'grabbing {url}...')
+        if not retronews.grab_magazine(url,
+                                       output_root=args.output,
+                                       probe_pages=args.fetch_probe_pages,
+                                       probe_all=args.force_probe,
+                                       only_fetch=args.only_fetch,
+                                       force_overwrite=args.force_overwrite):
+            logging.error('failed to grab')
+            break
+
+        if not args.continue_prev and not args.continue_next:
+            break
+
+        r = requests.get(url)
+
+        try:
+            next_url = None
+            if args.continue_next:
+                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+            elif args.continue_prev:
+                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
+            if not next_url:
+                break
+
+            if next_url.startswith('/'):
+                next_url = f'https://www.retronews.fr{next_url}'
+
+            url = next_url
+
+        except:
+            print('error: failed to find previous link! exiting')
+            break
diff --git a/mdf/__init__.py b/mdf/__init__.py
new file mode 100644
index 0000000..9466436
--- /dev/null
+++ b/mdf/__init__.py
@@ -0,0 +1,3 @@
+from .retronews import retronews
+from .util import util
+from .database import Database
\ No newline at end of file
diff --git a/database/__init__.py b/mdf/database/__init__.py
index ef3f969..ef3f969 100644
--- a/database/__init__.py
+++ b/mdf/database/__init__.py
diff --git a/database/database.py b/mdf/database/database.py
index 04902f1..fd08e38 100644
--- a/database/database.py
+++ b/mdf/database/database.py
@@ -1,7 +1,7 @@
import sqlite3
import logging
import os.path
-import retronews
+from ..retronews import retronews
import threading

from typing import Optional
@@ -13,7 +13,7 @@ class Database:
    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

-        file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
+        file = os.path.join(os.path.dirname(__file__), '..', '..', 'mdf-retrobase.sqlite3')
        self.sqlite = sqlite3.connect(file, check_same_thread=False)
        self.lock = threading.Lock()

@@ -125,10 +125,12 @@ class Database:
        sql = "SELECT issue_date, url, pages FROM mdf_links"
        if range:
            sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
+        sql += " ORDER BY issue_date"
        cur.execute(sql)
        for issue_date, url, pages in cur.fetchall():
            pub_date, collection_id, doc_id = retronews.parse_url(url)
            docs.append(dict(
+                url=url,
                collection_id=collection_id,
                doc_id=doc_id,
                pages=pages
diff --git a/mdf/retronews/__init__.py b/mdf/retronews/__init__.py
new file mode 100644
index 0000000..105ca70
--- /dev/null
+++ b/mdf/retronews/__init__.py
@@ -0,0 +1,15 @@
+from .retronews import (
+    convert_date,
+    parse_url,
+    _doc_info,
+    page_info,
+    thumbnail_url,
+    tile_url,
+    HTILES,
+    VTILES,
+    PAGE_FETCHING_POOL_SIZE,
+    TILE_MERGING_POOL_SIZE,
+    set_tile_merging_pool_size,
+    set_page_fetching_pool_size,
+    grab_magazine
+)
\ No newline at end of file
diff --git a/grab-retronews.py b/mdf/retronews/retronews.py
index ac4dbf1..4697e55 100755..100644
--- a/grab-retronews.py
+++ b/mdf/retronews/retronews.py
@@ -1,53 +1,75 @@
-#!/usr/bin/env python3
-import logging
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-import os
-import sys
-import json
import re
-import imghdr
import requests
-import urllib.request
-import urllib.error
-import http.client
-import subprocess
-import shutil
+import imghdr
+import json
+import os
import queue
+import shutil
import traceback
-import retronews
-import logging
-from database import Database
+from ..util.util import safe_print, download_file, run
from typing import Optional
-from threading import Thread, Lock
-from time import sleep
-from argparse import ArgumentParser
+from threading import Thread
+import urllib.error

-warnings.filterwarnings("ignore", category=DeprecationWarning)
+_pages_queue = queue.Queue()
+_merging_queue = queue.Queue()

VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8

-database = Database()
-print_lock = Lock()
-pages_queue = queue.Queue()
-merging_queue = queue.Queue()
+MONTHS = dict(
+    jan=1,
+    feb=2,
+    mar=3,
+    apr=4,
+    may=5,
+    jun=6,
+    jul=7,
+    juillet=7,
+    aout=8,
+    aug=8,
+    sep=9,
+    oct=10,
+    nov=11,
+    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
+    dec=12
+)
+
+
+def convert_date(s: str) -> tuple[str, str, str]:
+    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
+    year = m[2]
+    month = '%02d' % MONTHS[m[1]]
+    day = m[0]
+    return year, month, day
+
+
+def parse_url(url: str) -> tuple:
+    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()


-def safe_print(*args, **kwargs):
-    with print_lock:
-        print(*args, **kwargs)
+def _doc_info(collection_id, doc_id):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
+    return r.json()
+
+
+def page_info(collection_id, doc_id, page):
+    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
+    return r.json()
+
+
+def thumbnail_url(collection_id, doc_id, page) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
+
+
+def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
+    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'


-def run(args: list, **kwargs):
-    p = subprocess.run(args, **kwargs)
-    if p.returncode != 0:
-        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')


class DownloaderThread(Thread):
@@ -92,9 +114,9 @@ class TileMergeWorker(Thread):
    def run(self):
        safe_print(f'[tile merger {self._number}] started')

-        while not merging_queue.empty():
+        while not _merging_queue.empty():
            try:
-                page = merging_queue.get_nowait()
+                page = _merging_queue.get_nowait()
                page_dir = os.path.join(self._working_dir, str(page))
                thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
                meta_path = os.path.join(page_dir, 'meta.json')
@@ -116,7 +138,7 @@ class TileMergeWorker(Thread):
                for h in range(htiles):
                    vfiles = []
                    for v in range(vtiles):
-                        vfiles.append(f'{h}x{v}.jpg')
+                        vfiles.append(f'v{v}_h{h}.jpg')

                    run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
                    hfiles.append(f'{h}.jpg')
@@ -153,12 +175,12 @@ class PageFetchWorker(Thread):
        page = 0

        try:
-            while not pages_queue.empty():
+            while not _pages_queue.empty():
                try:
-                    page = pages_queue.get_nowait()
+                    page = _pages_queue.get_nowait()
                    safe_print(f'[pf-{self._number}] page {page} started')

-                    if self._probe_all or page in self._probe_pages:
+                    if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
                        self.probe_dl(page)
                    else:
                        try:
@@ -172,7 +194,7 @@ class PageFetchWorker(Thread):

        except Exception as e:
            self._failed = True
-            self._error = f'while fetching page {page}: {str(e)}'
+            self._error = f'while fetching page {page}: {str(e)}' + traceback.format_exc()

    def _get_page_dir(self, page):
        page_dir = os.path.join(self._working_dir, str(page))
@@ -191,7 +213,7 @@ class PageFetchWorker(Thread):
        dl_tasks = []
        for horiz_tile in range(HTILES):
            for vert_tile in range(VTILES):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
                output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
                if os.path.isfile(output_file):
                    if os.path.getsize(output_file) < 4:
@@ -230,7 +252,7 @@ class PageFetchWorker(Thread):
        dl_tasks = []
        for h in range(10):
            for v in range(10):
-                url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+                url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
                output_file = f'{page_dir}/{h}x{v}.jpg'
                if os.path.isfile(output_file):
                    safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
@@ -283,48 +305,26 @@ class PageFetchWorker(Thread):
    def thumbnail_dl(self, page):
        page_dir = self._get_page_dir(page)

-        thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
-        if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
-            raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+        thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
+        if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
+            raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')

        safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')


-def download_file(url, output, handle_http_errors=True) -> bool:
-    tries_left = 3
-    ok = False
-    while tries_left > 0:
-        try:
-            urllib.request.urlretrieve(url, output)
-            ok = True
-            break
-        except http.client.RemoteDisconnected:
-            ok = False
-            print(' caught an exception, sleeping for 2 seconds and retrying...')
-            sleep(2)
-            tries_left -= 1
-        except urllib.error.HTTPError as e:
-            if not handle_http_errors:
-                raise e
-            else:
-                print(f' failed to download {url}: {str(e)}')
-                return False
-    return ok
-
-
def grab_magazine(url: str,
                  output_root: str,
                  probe_pages: Optional[list[int]] = None,
                  probe_all=False,
                  only_fetch=False,
                  force_overwrite=False):
    try:
-        pub_date, collection_id, doc_id = retronews.parse_url(url)
+        pub_date, collection_id, doc_id = parse_url(url)
    except AttributeError:
        return False

-    data = retronews.doc_info(collection_id, doc_id)
+    data = _doc_info(collection_id, doc_id)
    pages = int(data['nbPages'])
    print(f'found {pages} pages')

-    y, m, d = retronews.convert_date(pub_date)
+    y, m, d = convert_date(pub_date)
    if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
        if not force_overwrite:
            print(f'{y}-{m}-{d}.pdf already exists, not continuing')
@@ -339,7 +339,7 @@ def grab_magazine(url: str,

    # fetch pages
    for page in range(pages):
-        pages_queue.put(page+1)
+        _pages_queue.put(page+1)

    pool = []
    for i in range(PAGE_FETCHING_POOL_SIZE):
@@ -366,7 +366,7 @@ def grab_magazine(url: str,
    # merge tiles
    for page in range(pages):
        page += 1
-        merging_queue.put(page)
+        _merging_queue.put(page)

    pool = []
    for i in range(TILE_MERGING_POOL_SIZE):
@@ -387,71 +387,11 @@ def grab_magazine(url: str,
    return True


-if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--url', type=str, required=True)
-    parser.add_argument('--output', type=str, required=True,
-                        help='output directory')
-    parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
-    parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
-    parser.add_argument('--continue-prev', action='store_true',
-                        help='keep scrapping backwards in time')
-    parser.add_argument('--continue-next', action='store_true',
-                        help='keep scrapping forwards in time')
-    parser.add_argument('--only-fetch', action='store_true',
-                        help='only fetch magazine tiles and exit, do not merge anything')
-    parser.add_argument('--force-overwrite', action='store_true',
-                        help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
-    parser.add_argument('--force-probe', action='store_true',
-                        help='force all pages to use the \'probe\' method')
-    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
-                        help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
-
-    args = parser.parse_args()
-
-    with_continuation = args.continue_prev or args.continue_next
-    if args.fetch_probe_pages and with_continuation:
-        raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
-    if args.only_fetch and with_continuation:
-        raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
-
-    TILE_MERGING_POOL_SIZE = args.merge_threads
-    PAGE_FETCHING_POOL_SIZE = args.fetch_threads
-
-    url = args.url
-    while True:
-        print(f'grabbing {url}...')
-        if not grab_magazine(url,
-                             output_root=args.output,
-                             probe_pages=args.fetch_probe_pages,
-                             probe_all=args.force_probe,
-                             only_fetch=args.only_fetch,
-                             force_overwrite=args.force_overwrite):
-            logging.error('failed to grab')
-            break
-
-        if not args.continue_prev and not args.continue_next:
-            break
-
-        r = requests.get(url)
-
-        try:
-            next_url = None
-            if args.continue_next:
-                next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
-            elif args.continue_prev:
-                next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
-
-            if not next_url:
-                if not next_url:
-                    break
-
-            if next_url.startswith('/'):
-                next_url = f'https://www.retronews.fr{next_url}'
-
-            url = next_url
+def set_tile_merging_pool_size(size):
+    global TILE_MERGING_POOL_SIZE
+    TILE_MERGING_POOL_SIZE = size

-        except:
-            print('error: failed to find previous link! exiting')
-            break
+
+def set_page_fetching_pool_size(size):
+    global PAGE_FETCHING_POOL_SIZE
+    PAGE_FETCHING_POOL_SIZE = size
diff --git a/mdf/util/__init__.py b/mdf/util/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/mdf/util/__init__.py
diff --git a/mdf/util/util.py b/mdf/util/util.py
new file mode 100644
index 0000000..b233d88
--- /dev/null
+++ b/mdf/util/util.py
@@ -0,0 +1,44 @@
+import subprocess
+import urllib.request
+import urllib.error
+
+from time import sleep
+from threading import Lock
+import http.client
+
+
+_print_lock = Lock()
+
+
+def safe_print(*args, **kwargs):
+    with _print_lock:
+        print(*args, **kwargs)
+
+
+def run(args: list, **kwargs):
+    p = subprocess.run(args, **kwargs)
+    if p.returncode != 0:
+        raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
+
+
+def download_file(url, output, handle_http_errors=True) -> bool:
+    tries_left = 3
+    ok = False
+    while tries_left > 0:
+        try:
+            urllib.request.urlretrieve(url, output)
+            ok = True
+            break
+        except http.client.RemoteDisconnected:
+            ok = False
+            print(' caught an exception, sleeping for 2 seconds and retrying...')
+            sleep(2)
+            tries_left -= 1
+        except urllib.error.HTTPError as e:
+            if not handle_http_errors:
+                raise e
+            else:
+                print(f' failed to download {url}: {str(e)}')
+                return False
+    return ok
+
diff --git a/retronews/__init__.py b/retronews/__init__.py
deleted file mode 100644
index ae3b518..0000000
--- a/retronews/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from .retronews import (
-    convert_date,
-    parse_url,
-    doc_info,
-    page_info,
-    thumbnail_url,
-    tile_url
-)
\ No newline at end of file
diff --git a/retronews/retronews.py b/retronews/retronews.py
deleted file mode 100644
index 9e80c58..0000000
--- a/retronews/retronews.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import re
-import requests
-
-MONTHS = dict(
-    jan=1,
-    feb=2,
-    mar=3,
-    apr=4,
-    may=5,
-    jun=6,
-    jul=7,
-    juillet=7,
-    aout=8,
-    aug=8,
-    sep=9,
-    oct=10,
-    nov=11,
-    novembre=11,  # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
-    dec=12
-)
-
-
-def convert_date(s: str) -> tuple[str, str, str]:
-    m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
-    year = m[2]
-    month = '%02d' % MONTHS[m[1]]
-    day = m[0]
-    return year, month, day
-
-
-def parse_url(url: str) -> tuple:
-    return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
-
-
-def doc_info(collection_id, doc_id):
-    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
-    return r.json()
-
-
-def page_info(collection_id, doc_id, page):
-    r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
-    return r.json()
-
-
-def thumbnail_url(collection_id, doc_id, page) -> str:
-    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
-
-
-def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
-    return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'