author    Evgeny Zinoviev <me@ch1p.io>    2023-12-30 15:37:08 +0300
committer Evgeny Zinoviev <me@ch1p.io>    2023-12-30 15:37:08 +0300
commit    abd1975def213891afdf0d87adbf79c2c7dbc0cb (patch)
tree      4c0feb5ecc7f8dcf5c17e74a81a2bae9ba6c649f
parent    38474234431fcf1dae50c00cdf649b4102c0faf9 (diff)
-rwxr-xr-x  dl-from-db.py                                                   45
-rwxr-xr-x  dl-retronews.py                                                 80
-rw-r--r--  mdf/__init__.py                                                  3
-rw-r--r--  mdf/database/__init__.py (renamed from database/__init__.py)     0
-rw-r--r--  mdf/database/database.py (renamed from database/database.py)     6
-rw-r--r--  mdf/retronews/__init__.py                                       15
-rw-r--r--  [-rwxr-xr-x] mdf/retronews/retronews.py (renamed from grab-retronews.py)  214
-rw-r--r--  mdf/util/__init__.py                                             0
-rw-r--r--  mdf/util/util.py                                                44
-rw-r--r--  retronews/__init__.py                                            8
-rw-r--r--  retronews/retronews.py                                          50

11 files changed, 268 insertions(+), 197 deletions(-)
diff --git a/dl-from-db.py b/dl-from-db.py
new file mode 100755
index 0000000..526c3a4
--- /dev/null
+++ b/dl-from-db.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import logging
+
+from mdf import Database, retronews
+from argparse import ArgumentParser
+
+database = Database()
+
+
+if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--output', type=str, required=True,
+ help='output directory')
+ parser.add_argument('--from-date', type=str)
+ parser.add_argument('--to-date', type=str)
+ parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+ parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+ parser.add_argument('--only-fetch', action='store_true',
+ help='only fetch magazine tiles and exit, do not merge anything')
+ parser.add_argument('--force-overwrite', action='store_true',
+ help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+ parser.add_argument('--force-probe', action='store_true',
+ help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, for when the number of vertical and horizontal tiles is unknown')
+
+ args = parser.parse_args()
+
+ retronews.set_tile_merging_pool_size(args.merge_threads)
+ retronews.set_page_fetching_pool_size(args.fetch_threads)
+
+ for doc in database.get_documents((args.from_date, args.to_date)):
+ url = doc['url']
+ print(f'grabbing {url}...')
+ if not retronews.grab_magazine(url,
+ output_root=args.output,
+ probe_pages=args.fetch_probe_pages,
+ probe_all=args.force_probe,
+ only_fetch=args.only_fetch,
+ force_overwrite=args.force_overwrite):
+ logging.error(f'failed to grab {url}')
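The same date-range export can also be driven without the CLI wrapper. A minimal sketch, assuming the mdf package is importable from the repository root, that the output directory exists, and with illustrative dates:

#!/usr/bin/env python3
# Sketch: grab every issue in a date range programmatically.
from mdf import Database, retronews

db = Database()
for doc in db.get_documents(('1906-01-01', '1906-12-31')):
    url = doc['url']
    # grab_magazine() returns False when the URL cannot be parsed
    if not retronews.grab_magazine(url, output_root='out'):
        print(f'failed to grab {url}')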
diff --git a/dl-retronews.py b/dl-retronews.py
new file mode 100755
index 0000000..6ee4325
--- /dev/null
+++ b/dl-retronews.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+
+import re
+import requests
+import logging
+
+from mdf import retronews
+from argparse import ArgumentParser
+
+
+
+if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--url', type=str, required=True)
+ parser.add_argument('--output', type=str, required=True,
+ help='output directory')
+ parser.add_argument('--merge-threads', default=retronews.TILE_MERGING_POOL_SIZE, type=int)
+ parser.add_argument('--fetch-threads', default=retronews.PAGE_FETCHING_POOL_SIZE, type=int)
+    parser.add_argument('--continue-prev', action='store_true',
+                        help='keep scraping backwards in time')
+    parser.add_argument('--continue-next', action='store_true',
+                        help='keep scraping forwards in time')
+ parser.add_argument('--only-fetch', action='store_true',
+ help='only fetch magazine tiles and exit, do not merge anything')
+ parser.add_argument('--force-overwrite', action='store_true',
+ help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
+ parser.add_argument('--force-probe', action='store_true',
+ help='force all pages to use the \'probe\' method')
+    parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
+                        help='force some pages to use the \'probe\' method, for when the number of vertical and horizontal tiles is unknown')
+
+ args = parser.parse_args()
+
+ with_continuation = args.continue_prev or args.continue_next
+    if args.fetch_probe_pages and with_continuation:
+        raise RuntimeError('--fetch-probe-pages cannot be used together with the --continue-* options, it\'s a one-time hack')
+    if args.only_fetch and with_continuation:
+        raise RuntimeError('--only-fetch cannot be used together with the --continue-* options, it\'s a one-time hack')
+
+    retronews.set_tile_merging_pool_size(args.merge_threads)
+    retronews.set_page_fetching_pool_size(args.fetch_threads)
+
+ url = args.url
+ while True:
+ print(f'grabbing {url}...')
+ if not retronews.grab_magazine(url,
+ output_root=args.output,
+ probe_pages=args.fetch_probe_pages,
+ probe_all=args.force_probe,
+ only_fetch=args.only_fetch,
+ force_overwrite=args.force_overwrite):
+ logging.error('failed to grab')
+ break
+
+ if not args.continue_prev and not args.continue_next:
+ break
+
+ r = requests.get(url)
+
+ try:
+ next_url = None
+ if args.continue_next:
+ next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
+ elif args.continue_prev:
+ next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
+
+ if not next_url:
+ break
+
+ if next_url.startswith('/'):
+ next_url = f'https://www.retronews.fr{next_url}'
+
+ url = next_url
+
+    except AttributeError:
+        print('error: failed to find the next/previous issue link! exiting')
+ break
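The loop above walks the SUIVANT / PRÉCÉDENT navigation anchors embedded in each issue page. For reference, the same extraction as a standalone helper; find_adjacent_url is an illustrative name, not part of this codebase:

import re
import requests
from typing import Optional

# Regexes copied verbatim from the loop above.
PATTERNS = {
    'next': r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT',
    'prev': r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>',
}

def find_adjacent_url(url: str, direction: str = 'next') -> Optional[str]:
    """Return the URL of the adjacent issue, or None if no link is found."""
    html = requests.get(url).text
    m = re.search(PATTERNS[direction], html, re.S)
    if m is None:
        return None
    href = m.group(1)
    # relative hrefs need the site prefix
    return f'https://www.retronews.fr{href}' if href.startswith('/') else href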
diff --git a/mdf/__init__.py b/mdf/__init__.py
new file mode 100644
index 0000000..9466436
--- /dev/null
+++ b/mdf/__init__.py
@@ -0,0 +1,3 @@
+from .retronews import retronews
+from .util import util
+from .database import Database
\ No newline at end of file
diff --git a/database/__init__.py b/mdf/database/__init__.py
index ef3f969..ef3f969 100644
--- a/database/__init__.py
+++ b/mdf/database/__init__.py
diff --git a/database/database.py b/mdf/database/database.py
index 04902f1..fd08e38 100644
--- a/database/database.py
+++ b/mdf/database/database.py
@@ -1,7 +1,7 @@
import sqlite3
import logging
import os.path
-import retronews
+from ..retronews import retronews
import threading
from typing import Optional
@@ -13,7 +13,7 @@ class Database:
def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__)
- file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3')
+ file = os.path.join(os.path.dirname(__file__), '..', '..', 'mdf-retrobase.sqlite3')
self.sqlite = sqlite3.connect(file, check_same_thread=False)
self.lock = threading.Lock()
@@ -125,10 +125,12 @@ class Database:
sql = "SELECT issue_date, url, pages FROM mdf_links"
if range:
sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'"
+ sql += " ORDER BY issue_date"
cur.execute(sql)
for issue_date, url, pages in cur.fetchall():
pub_date, collection_id, doc_id = retronews.parse_url(url)
docs.append(dict(
+ url=url,
collection_id=collection_id,
doc_id=doc_id,
pages=pages
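Side note on the query built here: the BETWEEN clause interpolates the date strings directly into the SQL. A parameterized sketch of the same query, assuming the same cursor and range tuple (the name range mirrors the source and shadows the builtin):

sql = "SELECT issue_date, url, pages FROM mdf_links"
params = ()
if range:
    # placeholders avoid quoting problems with the date strings
    sql += " WHERE issue_date BETWEEN ? AND ?"
    params = (range[0], range[1])
sql += " ORDER BY issue_date"
cur.execute(sql, params)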
diff --git a/mdf/retronews/__init__.py b/mdf/retronews/__init__.py
new file mode 100644
index 0000000..105ca70
--- /dev/null
+++ b/mdf/retronews/__init__.py
@@ -0,0 +1,15 @@
+from .retronews import (
+ convert_date,
+ parse_url,
+ _doc_info,
+ page_info,
+ thumbnail_url,
+ tile_url,
+ HTILES,
+ VTILES,
+ PAGE_FETCHING_POOL_SIZE,
+ TILE_MERGING_POOL_SIZE,
+ set_tile_merging_pool_size,
+ set_page_fetching_pool_size,
+ grab_magazine
+)
\ No newline at end of file
diff --git a/grab-retronews.py b/mdf/retronews/retronews.py
index ac4dbf1..4697e55 100755..100644
--- a/grab-retronews.py
+++ b/mdf/retronews/retronews.py
@@ -1,53 +1,75 @@
-#!/usr/bin/env python3
-import logging
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-import os
-import sys
-import json
import re
-import imghdr
import requests
-import urllib.request
-import urllib.error
-import http.client
-import subprocess
-import shutil
+import imghdr
+import json
+import os
import queue
+import shutil
import traceback
-import retronews
-import logging
-from database import Database
+from ..util.util import safe_print, download_file, run
from typing import Optional
-from threading import Thread, Lock
-from time import sleep
-from argparse import ArgumentParser
+from threading import Thread
+import urllib.error
-warnings.filterwarnings("ignore", category=DeprecationWarning)
+_pages_queue = queue.Queue()
+_merging_queue = queue.Queue()
VTILES = 3
HTILES = 2
TILE_MERGING_POOL_SIZE = 8
PAGE_FETCHING_POOL_SIZE = 8
-database = Database()
-print_lock = Lock()
-pages_queue = queue.Queue()
-merging_queue = queue.Queue()
+MONTHS = dict(
+ jan=1,
+ feb=2,
+ mar=3,
+ apr=4,
+ may=5,
+ jun=6,
+ jul=7,
+ juillet=7,
+ aout=8,
+ aug=8,
+ sep=9,
+ oct=10,
+ nov=11,
+ novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
+ dec=12
+)
+
+
+def convert_date(s: str) -> tuple[str, str, str]:
+ m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
+ year = m[2]
+ month = '%02d' % MONTHS[m[1]]
+ day = m[0]
+ return year, month, day
+
+
+def parse_url(url: str) -> tuple:
+ return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
-def safe_print(*args, **kwargs):
- with print_lock:
- print(*args, **kwargs)
+def _doc_info(collection_id, doc_id):
+ r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
+ return r.json()
+
+
+def page_info(collection_id, doc_id, page):
+ r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
+ return r.json()
+
+
+def thumbnail_url(collection_id, doc_id, page) -> str:
+ return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
+
+
+def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
+ return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'
-def run(args: list, **kwargs):
- p = subprocess.run(args, **kwargs)
- if p.returncode != 0:
- raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
class DownloaderThread(Thread):
@@ -92,9 +114,9 @@ class TileMergeWorker(Thread):
def run(self):
safe_print(f'[tile merger {self._number}] started')
- while not merging_queue.empty():
+ while not _merging_queue.empty():
try:
- page = merging_queue.get_nowait()
+ page = _merging_queue.get_nowait()
page_dir = os.path.join(self._working_dir, str(page))
thumbnail_path = os.path.join(page_dir, 'thumbnail.jpg')
meta_path = os.path.join(page_dir, 'meta.json')
@@ -116,7 +138,7 @@ class TileMergeWorker(Thread):
for h in range(htiles):
vfiles = []
for v in range(vtiles):
- vfiles.append(f'{h}x{v}.jpg')
+ vfiles.append(f'v{v}_h{h}.jpg')
run(['convert', '-append', *vfiles, f'{h}.jpg'], cwd=page_dir)
hfiles.append(f'{h}.jpg')
@@ -153,12 +175,12 @@ class PageFetchWorker(Thread):
page = 0
try:
- while not pages_queue.empty():
+ while not _pages_queue.empty():
try:
- page = pages_queue.get_nowait()
+ page = _pages_queue.get_nowait()
safe_print(f'[pf-{self._number}] page {page} started')
- if self._probe_all or page in self._probe_pages:
+ if self._probe_all or (self._probe_pages is not None and page in self._probe_pages):
self.probe_dl(page)
else:
try:
@@ -172,7 +194,7 @@ class PageFetchWorker(Thread):
except Exception as e:
self._failed = True
- self._error = f'while fetching page {page}: {str(e)}'
+            self._error = f'while fetching page {page}: {str(e)}\n' + traceback.format_exc()
def _get_page_dir(self, page):
page_dir = os.path.join(self._working_dir, str(page))
@@ -191,7 +213,7 @@ class PageFetchWorker(Thread):
dl_tasks = []
for horiz_tile in range(HTILES):
for vert_tile in range(VTILES):
- url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
+ url = tile_url(self._collection_id, self._doc_id, page, h_tile=horiz_tile, v_tile=vert_tile)
output_file = f'{page_dir}/v{vert_tile}_h{horiz_tile}.jpg'
if os.path.isfile(output_file):
if os.path.getsize(output_file) < 4:
@@ -230,7 +252,7 @@ class PageFetchWorker(Thread):
dl_tasks = []
for h in range(10):
for v in range(10):
- url = retronews.tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
+ url = tile_url(self._collection_id, self._doc_id, page, h_tile=h, v_tile=v)
output_file = f'{page_dir}/{h}x{v}.jpg'
if os.path.isfile(output_file):
safe_print(f'[pf-{self._number}] probing page {page}: v={v} h={h} ALREADY')
@@ -283,48 +305,26 @@ class PageFetchWorker(Thread):
def thumbnail_dl(self, page):
page_dir = self._get_page_dir(page)
- thumbnail_url = retronews.thumbnail_url(self._collection_id, self._doc_id, page)
- if not download_file(thumbnail_url, os.path.join(page_dir, 'thumbnail.jpg')):
- raise RuntimeError(f'network error, failed to download thumbnail ({thumbnail_url})')
+ thumb_url = thumbnail_url(self._collection_id, self._doc_id, page)
+ if not download_file(thumb_url, os.path.join(page_dir, 'thumbnail.jpg')):
+ raise RuntimeError(f'network error, failed to download thumbnail ({thumb_url})')
safe_print(f'[pf-{self._number}] page {page}: corrupt files; replaced with a thumbnail')
-def download_file(url, output, handle_http_errors=True) -> bool:
- tries_left = 3
- ok = False
- while tries_left > 0:
- try:
- urllib.request.urlretrieve(url, output)
- ok = True
- break
- except http.client.RemoteDisconnected:
- ok = False
- print(' caught an exception, sleeping for 2 seconds and retrying...')
- sleep(2)
- tries_left -= 1
- except urllib.error.HTTPError as e:
- if not handle_http_errors:
- raise e
- else:
- print(f' failed to download {url}: {str(e)}')
- return False
- return ok
-
-
def grab_magazine(url: str,
output_root: str,
probe_pages: Optional[list[int]] = None,
probe_all=False, only_fetch=False, force_overwrite=False):
try:
- pub_date, collection_id, doc_id = retronews.parse_url(url)
+ pub_date, collection_id, doc_id = parse_url(url)
except AttributeError:
return False
- data = retronews.doc_info(collection_id, doc_id)
+ data = _doc_info(collection_id, doc_id)
pages = int(data['nbPages'])
print(f'found {pages} pages')
- y, m, d = retronews.convert_date(pub_date)
+ y, m, d = convert_date(pub_date)
if os.path.exists(os.path.join(output_root, f'{y}-{m}-{d}.pdf')):
if not force_overwrite:
print(f'{y}-{m}-{d}.pdf already exists, not continuing')
@@ -339,7 +339,7 @@ def grab_magazine(url: str,
# fetch pages
for page in range(pages):
- pages_queue.put(page+1)
+ _pages_queue.put(page+1)
pool = []
for i in range(PAGE_FETCHING_POOL_SIZE):
@@ -366,7 +366,7 @@ def grab_magazine(url: str,
# merge tiles
for page in range(pages):
page += 1
- merging_queue.put(page)
+ _merging_queue.put(page)
pool = []
for i in range(TILE_MERGING_POOL_SIZE):
@@ -387,71 +387,11 @@ def grab_magazine(url: str,
return True
-if __name__ == '__main__':
- parser = ArgumentParser()
- parser.add_argument('--url', type=str, required=True)
- parser.add_argument('--output', type=str, required=True,
- help='output directory')
- parser.add_argument('--merge-threads', default=TILE_MERGING_POOL_SIZE, type=int)
- parser.add_argument('--fetch-threads', default=PAGE_FETCHING_POOL_SIZE, type=int)
- parser.add_argument('--continue-prev', action='store_true',
- help='keep scrapping backwards in time')
- parser.add_argument('--continue-next', action='store_true',
- help='keep scrapping forwards in time')
- parser.add_argument('--only-fetch', action='store_true',
- help='only fetch magazine tiles and exit, do not merge anything')
- parser.add_argument('--force-overwrite', action='store_true',
- help='if file yyyy-mm-dd.pdf already exists, delete it and start over')
- parser.add_argument('--force-probe', action='store_true',
- help='force all pages to use the \'probe\' method')
- parser.add_argument('--fetch-probe-pages', nargs='+', type=int,
- help='force some pages to use the \'probe\' method, when count of vertical and horizontal tiles is unknown')
-
- args = parser.parse_args()
-
- with_continuation = args.continue_prev or args.continue_next
- if args.fetch_probe_pages and with_continuation:
- raise RuntimeError('--fetch-probe-pages cannot be used together with --continue-* options, it\'s a one time hack')
- if args.only_fetch and with_continuation:
- raise RuntimeError('--only-fetch cannot be used together with --continue-* options, it\'s a one time hack')
-
- TILE_MERGING_POOL_SIZE = args.merge_threads
- PAGE_FETCHING_POOL_SIZE = args.fetch_threads
-
- url = args.url
- while True:
- print(f'grabbing {url}...')
- if not grab_magazine(url,
- output_root=args.output,
- probe_pages=args.fetch_probe_pages,
- probe_all=args.force_probe,
- only_fetch=args.only_fetch,
- force_overwrite=args.force_overwrite):
- logging.error('failed to grab')
- break
-
- if not args.continue_prev and not args.continue_next:
- break
-
- r = requests.get(url)
-
- try:
- next_url = None
- if args.continue_next:
- next_url = re.search(r'<a class="float-right pt-4 text-secondary" href="([^"]+)">SUIVANT', r.text, re.S).groups()[0]
- elif args.continue_prev:
- next_url = re.search(r'<a class="float-left pt-4 text-secondary" href="([^"]+)"><i class="fa fa-chevron-left">\s+</i>\s+PRÉCÉDENT</a>', r.text, re.S).groups()[0]
-
-            if not next_url:
-                break
-
- if next_url.startswith('/'):
- next_url = f'https://www.retronews.fr{next_url}'
-
- url = next_url
+def set_tile_merging_pool_size(size):
+ global TILE_MERGING_POOL_SIZE
+ TILE_MERGING_POOL_SIZE = size
- except:
- print('error: failed to find previous link! exiting')
- break
+def set_page_fetching_pool_size(size):
+ global PAGE_FETCHING_POOL_SIZE
+ PAGE_FETCHING_POOL_SIZE = size
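With the CLI entry point gone, callers now configure the pool sizes through these setters before grabbing. A minimal usage sketch; the output directory is an assumption, and the URL is the one referenced in the MONTHS comment above:

from mdf import retronews

retronews.set_page_fetching_pool_size(16)  # threads downloading page tiles
retronews.set_tile_merging_pool_size(4)    # threads running `convert` to stitch tiles
retronews.grab_magazine(
    'https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1',
    output_root='out')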
diff --git a/mdf/util/__init__.py b/mdf/util/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/mdf/util/__init__.py
diff --git a/mdf/util/util.py b/mdf/util/util.py
new file mode 100644
index 0000000..b233d88
--- /dev/null
+++ b/mdf/util/util.py
@@ -0,0 +1,44 @@
+import subprocess
+import urllib.request
+import urllib.error
+
+from time import sleep
+from threading import Lock
+import http.client
+
+
+_print_lock = Lock()
+
+
+def safe_print(*args, **kwargs):
+ with _print_lock:
+ print(*args, **kwargs)
+
+
+def run(args: list, **kwargs):
+ p = subprocess.run(args, **kwargs)
+ if p.returncode != 0:
+ raise OSError(f'convert returned {p.returncode} ('+' '.join(args)+')')
+
+
+def download_file(url, output, handle_http_errors=True) -> bool:
+ tries_left = 3
+ ok = False
+ while tries_left > 0:
+ try:
+ urllib.request.urlretrieve(url, output)
+ ok = True
+ break
+ except http.client.RemoteDisconnected:
+ ok = False
+ print(' caught an exception, sleeping for 2 seconds and retrying...')
+ sleep(2)
+ tries_left -= 1
+ except urllib.error.HTTPError as e:
+ if not handle_http_errors:
+ raise e
+ else:
+ print(f' failed to download {url}: {str(e)}')
+ return False
+ return ok
+
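As moved here, download_file() retries dropped connections up to three times with a two-second pause, and by default swallows HTTP errors, logging them and returning False. A usage sketch; the collection/document IDs are taken from the URL in the MONTHS comment and are illustrative only:

import urllib.error
from mdf.util.util import download_file

# Default: an HTTP error (e.g. 404) is printed and reported as False.
ok = download_file('https://pv5web.retronews.fr/api/document/118/2617647/page/1/thumbnail',
                   'thumbnail.jpg')

# With handle_http_errors=False the HTTPError propagates to the caller:
try:
    download_file('https://pv5web.retronews.fr/api/document/118/2617647/page/1/tile/9/9/0',
                  'tile.jpg', handle_http_errors=False)
except urllib.error.HTTPError as e:
    print(f'HTTP {e.code}')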
diff --git a/retronews/__init__.py b/retronews/__init__.py
deleted file mode 100644
index ae3b518..0000000
--- a/retronews/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from .retronews import (
- convert_date,
- parse_url,
- doc_info,
- page_info,
- thumbnail_url,
- tile_url
-)
\ No newline at end of file
diff --git a/retronews/retronews.py b/retronews/retronews.py
deleted file mode 100644
index 9e80c58..0000000
--- a/retronews/retronews.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import re
-import requests
-
-MONTHS = dict(
- jan=1,
- feb=2,
- mar=3,
- apr=4,
- may=5,
- jun=6,
- jul=7,
- juillet=7,
- aout=8,
- aug=8,
- sep=9,
- oct=10,
- nov=11,
- novembre=11, # https://www.retronews.fr/journal/mercure-de-france/15-novembre-1905/118/2617647/1
- dec=12
-)
-
-
-def convert_date(s: str) -> tuple[str, str, str]:
- m = re.match(r'^(\d{2})-(.*?)-(\d{4})$', s).groups()
- year = m[2]
- month = '%02d' % MONTHS[m[1]]
- day = m[0]
- return year, month, day
-
-
-def parse_url(url: str) -> tuple:
- return re.search(r'/(?:[\-\d\w]+)/([^/]+)/(\d+)/(\d+)/', url).groups()
-
-
-def doc_info(collection_id, doc_id):
- r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}')
- return r.json()
-
-
-def page_info(collection_id, doc_id, page):
- r = requests.get(f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/')
- return r.json()
-
-
-def thumbnail_url(collection_id, doc_id, page) -> str:
- return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/thumbnail'
-
-
-def tile_url(collection_id, doc_id, page, v_tile, h_tile) -> str:
- return f'https://pv5web.retronews.fr/api/document/{collection_id}/{doc_id}/page/{page}/tile/{h_tile}/{v_tile}/0'