Diffstat (limited to 'grab-retronews.py')
-rwxr-xr-x  grab-retronews.py  132
1 file changed, 95 insertions(+), 37 deletions(-)
diff --git a/grab-retronews.py b/grab-retronews.py
index fbd1b28..ac4dbf1 100755
--- a/grab-retronews.py
+++ b/grab-retronews.py
@@ -1,4 +1,8 @@
#!/usr/bin/env python3
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
import os
import sys
import json
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
@@ -47,18 +54,27 @@ class DownloaderThread(Thread):
_url: str
_save_as: str
_download_result: Optional[bool]
+ _handle_http: bool
+ user_info: dict
- def __init__(self, url: str, save_as: str, thread_name=None):
+ def __init__(self, url: str, save_as: str, thread_name=None, handle_http=False, user_info=None):
super().__init__()
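+ # use a fresh dict per instance rather than a shared mutable default argument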
+ if user_info is None:
+ user_info = {}
if thread_name:
self.name = thread_name
self._url = url
self._save_as = save_as
self._download_result = None
+ self._handle_http = handle_http
+ self.user_info = user_info
def run(self):
- self._download_result = download_file(self._url, self._save_as)
+ try:
+ self._download_result = download_file(self._url, self._save_as, handle_http_errors=not self._handle_http)
+ except urllib.error.HTTPError:
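+ # an HTTP error (e.g. 404) just means this tile does not exist;
+ # _download_result stays None, so is_downloaded() reports False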
+ pass
def is_downloaded(self) -> bool:
return self._download_result is True
@@ -80,7 +96,7 @@ class TileMergeWorker(Thread):
try:
page = merging_queue.get_nowait()
page_dir = os.path.join(self._working_dir, str(page))
- thumbnail_path = os.path.join(self._working_dir, 'thumbnail.jpg')
+ thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
meta_path = os.path.join(page_dir, 'meta.json')
if os.path.exists(thumbnail_path):
@@ -100,12 +116,12 @@ class TileMergeWorker(Thread):
for h in range(htiles):
vfiles = []
for v in range(vtiles):
- vfiles.append(f'v{v}_h{h}.jpg')
- run(['convert', '-append', *vfiles, f'_v_{h}.jpg'], cwd=page_dir)
- hfiles.append(f'_v_{h}.jpg')
+ vfiles.append(f'{h}x{v}.jpg')
+ run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
+ hfiles.append(f'{h}.jpg')
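+ # ImageMagick 'convert -append' stacks each column's tiles vertically; the
+ # '+append' call below joins the column strips left to right into the full page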
run(['convert', '+append', *hfiles, os.path.join(self._working_dir, f'{page}.jpg')], cwd=page_dir)
- shutil.rmtree(page_dir)
+ # shutil.rmtree(page_dir)
safe_print(f'[tile merger {self._number}] page {page} done')
@@ -119,8 +135,9 @@ class PageFetchWorker(Thread):
_failed: bool
_error: Optional[str]
_probe_pages: Optional[list[int]]
+ _probe_all: bool
- def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None):
+ def __init__(self, working_dir: str, number: int, collection_id, doc_id, probe_pages: Optional[list[int]] = None, probe_all=False):
super().__init__()
self._working_dir = working_dir
self._number = number
@@ -129,6 +146,7 @@ class PageFetchWorker(Thread):
self._failed = False
self._error = None
self._probe_pages = probe_pages
+ self._probe_all = probe_all
def run(self):
safe_print(f'[pf-{self._number}] started')
@@ -140,7 +158,7 @@ class PageFetchWorker(Thread):
page = pages_queue.get_nowait()
safe_print(f'[pf-{self._number}] page {page} started')
- if page in self._probe_pages:
+ if self._probe_all or (self._probe_pages and page in self._probe_pages):
self.probe_dl(page)
else:
try:
@@ -209,28 +227,52 @@ class PageFetchWorker(Thread):
real_h = 0
real_v = 0
data_error = False
- for h in range(5):
- for v in range(5):
+ dl_tasks = []
+ for h in range(10):
+ for v in range(10):
url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
- output_file = f'{page_dir}/v{v}_h{h}.jpg'
+ output_file = f'{page_dir}/{h}x{v}.jpg'
if os.path.isfile(output_file):
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
if os.path.getsize(output_file) < 4:
os.unlink(output_file)
continue
- try:
- if not download_file(url, output_file, handle_http_errors=False):
- raise OSError('network failure')
- if not imghdr.what(output_file):
- data_error = True
- break
- real_v = v
- real_h = h
- safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} OK')
-
- except urllib.error.HTTPError:
- safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} FAIL')
- break
+
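+ # enqueue one downloader thread per candidate tile; (h, v) travels in
+ # user_info so the real grid size can be recovered after the joins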
+ dl_tasks.append(DownloaderThread(url=url,
+ save_as=output_file,
+ handle_http=True,
+ thread_name=f'p{page}-v{v}-h{h}',
+ user_info=dict(h=h, v=v)))
+
+ for task in dl_tasks:
+ task.start()
+ for task in dl_tasks:
+ task.join()
+
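+ # track the largest (h, v) that actually downloaded to learn the real
+ # tile grid; a non-image payload flags the page for the thumbnail fallback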
+ if task.is_downloaded():
+ task_h = task.user_info['h']
+ task_v = task.user_info['v']
+ if task_h > real_h:
+ real_h = task_h
+ if task_v > real_v:
+ real_v = task_v
+
+ if not imghdr.what(task._save_as):
+ data_error = True
+
if data_error:
self.thumbnail_dl(page)
@@ -272,10 +314,13 @@ def download_file(url, output, handle_http_errors=True) -> bool:
def grab_magazine(url: str,
output_root: str,
probe_pages: Optional[list[int]] = None,
- only_fetch=False, force_overwrite=False):
- pub_date, collection_id, doc_id = retronews.parse_url(url)
+ probe_all=False, only_fetch=False, force_overwrite=False):
+ try:
+ pub_date, collection_id, doc_id = retronews.parse_url(url)
+ except AttributeError:
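+ # URL did not match the expected pattern; report failure so the caller's loop can stop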
+ return False
- data = retronews.api_doc_info(collection_id, doc_id)
+ data = retronews.doc_info(collection_id, doc_id)
pages = int(data['nbPages'])
print(f'found {pages} pages')
@@ -283,7 +328,7 @@ def grab_magazine(url: str,
if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
if not force_overwrite:
print(f'{y}-{m}-{d}.pdf already exists, not continuing')
- return
+ return True
else:
os.unlink(os.path.join(output_root, f'{y}-{m}-{d}.pdf'))
print(f'{y}-{m}-{d}.pdf already exists, deleting and continuing (force_overwrite=on)')
@@ -302,7 +347,8 @@ def grab_magazine(url: str,
number=i+1,
collection_id=collection_id,
doc_id=doc_id,
- probe_pages=probe_pages))
+ probe_pages=probe_pages,
+ probe_all=probe_all))
for worker in pool:
worker.start()
@@ -312,10 +358,10 @@ def grab_magazine(url: str,
with open(os.path.join(output_dir, 'error.txt'), 'w') as f:
f.write(f'error: {worker.get_error()}')
print(f'ERROR: failed to download {pub_date} magazine')
- return
+ return False
if only_fetch:
- return
+ return True
# merge tiles
for page in range(pages):
@@ -338,6 +384,8 @@ def grab_magazine(url: str,
except:
traceback.print_exc()
+ return True
+
if __name__ == '__main__':
parser = ArgumentParser()
@@ -354,6 +402,8 @@ if __name__ == '__main__':
help='only fetch magazine tiles and exit, do not merge anything')
parser.add_argument('--force-overwrite', action='store_true',
help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+ parser.add_argument('--force-probe', action='store_true',
+ help='force all pages to use the \'probe\' method')
parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
@@ -371,11 +421,14 @@ if __name__ == '__main__':
url = args.url
while True:
print(f'grabbing {url}...')
- grab_magazine(url,
- output_root=args.output,
- probe_pages=args.fetch_probe_pages,
- only_fetch=args.only_fetch,
- force_overwrite=args.force_overwrite)
+ if not grab_magazine(url,
+ output_root=args.output,
+ probe_pages=args.fetch_probe_pages,
+ probe_all=args.force_probe,
+ only_fetch=args.only_fetch,
+ force_overwrite=args.force_overwrite):
+ logging.error('failed to grab')
+ break
if not args.continue_prev and not args.continue_next:
break
@@ -383,11 +436,16 @@ if __name__ == '__main__':
r = requests.get(url)
try:
+ next_url = None
if args.continue_next:
next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
elif args.continue_prev:
next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+ if not next_url:
+ break
+
if next_url.startswith('/'):
next_url = f'https://www.retronews.fr{next_url}'
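
A typical invocation of the updated script, as a sketch: the positional url argument and --output are referenced in the code above (args.url, args.output), but their parser definitions fall outside this diff, so their exact spellings are assumed here.

./grab-retronews.py 'https://www.retronews.fr/journal/...' \
    --output ./magazines \
    --force-probe \
    --continue-next

--force-probe routes every page through the threaded 10x10 tile probe added in this commit, and --continue-next keeps following each issue's SUIVANT link until grab_magazine() reports failure.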