diff options
author | Evgeny Zinoviev <me@ch1p.io> | 2023-02-02 07:50:34 +0300 |
---|---|---|
committer | Evgeny Zinoviev <me@ch1p.io> | 2023-02-02 07:50:34 +0300 |
commit | 209c6404eb274c9f31e6ce847d365ca82d1bdac9 (patch) | |
tree | 907ff5b651fd18c22eb7a9460a740ecff8f36c46 /fill-pages-info.py |
initial
Diffstat (limited to 'fill-pages-info.py')
-rwxr-xr-x | fill-pages-info.py | 107 |
1 file changed, 107 insertions, 0 deletions
#!/usr/bin/env python3
"""Fill in width/height/dpi info for document pages.

Spawns a pool of worker threads that fetch page metadata from retronews
and store it in the local database.  With --fails, pages that previously
failed are retried and updated instead of inserted.
"""

import retronews
import threading
import queue
import sqlite3

from database import Database
from argparse import ArgumentParser

db = Database()
print_lock = threading.Lock()   # serializes console output
ok_lock = threading.Lock()      # guards done_ok
fail_lock = threading.Lock()    # guards done_fail
tasks_queue = queue.Queue()     # (collection_id, doc_id, page) work items

done_ok = 0
done_fail = 0


def incr_ok():
    """Count one successfully processed page and report progress."""
    global done_ok
    with ok_lock:
        done_ok += 1
    print_state()


def incr_fail():
    """Count one failed page and report progress."""
    global done_fail
    with fail_lock:
        done_fail += 1
    print_state()


def print_state():
    """Print the current ok/fail counters (thread-safe)."""
    with print_lock:
        print(f'ok={done_ok} fail={done_fail}')


class PageWorker(threading.Thread):
    """Worker thread draining tasks_queue; inserts or updates page rows."""

    _do_update: bool  # True: retry mode (db.update_page), False: db.add_page

    def __init__(self, do_update: bool):
        super().__init__()
        self._do_update = do_update

    def run(self):
        while True:
            # Grab the next task; an empty queue means all work is done.
            try:
                collection_id, doc_id, page = tasks_queue.get_nowait()
            except queue.Empty:
                break

            try:
                info = retronews.page_info(collection_id, doc_id, page)
                f = getattr(db, 'update_page' if self._do_update else 'add_page')
                try:
                    f(collection_id, doc_id, page,
                      info['width'], info['height'], info['dpi'])
                except sqlite3.IntegrityError:
                    # Duplicate row: report it but still count the page as done,
                    # matching the original control flow.
                    with print_lock:
                        print(f'error: unique failed for ({collection_id}, {doc_id}, {page})')
                incr_ok()
            except Exception:
                # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
                # are not swallowed and recorded as page failures.
                if self._do_update:
                    with print_lock:
                        print(f'error: skipping updating the page ({collection_id}, {doc_id}, {page}) cause failed again')
                else:
                    db.add_page_failed(collection_id, doc_id, page)
                incr_fail()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--threads', type=int, required=True)
    parser.add_argument('--fails', action='store_true')
    args = parser.parse_args()

    if args.fails:
        # Retry mode: re-queue only the pages that previously failed.
        for cid, did, page in db.get_existing_pages(fail=1):
            tasks_queue.put((cid, did, page))
    else:
        # Normal mode: queue every page not yet present in the database.
        # A set replaces the original dict-with-dummy-values membership map.
        ex_map = {
            f'{cid}_{did}_{page}'
            for cid, did, page in db.get_existing_pages()
        }
        for doc in db.get_documents():
            # Pages are numbered from 1 (original looped 0-based and did
            # `page += 1` inside the body).
            for page in range(1, doc['pages'] + 1):
                if f"{doc['collection_id']}_{doc['doc_id']}_{page}" not in ex_map:
                    tasks_queue.put((doc['collection_id'], doc['doc_id'], page))
        # Release the potentially large membership set before the workers run.
        del ex_map

    pool = [PageWorker(do_update=args.fails) for _ in range(args.threads)]
    for t in pool:
        t.start()
    for t in pool:
        t.join()