import sqlite3 import logging import os.path import retronews import threading from typing import Optional class Database: SCHEMA = 6 def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) file = os.path.join(os.path.dirname(__file__), 'mdf-retrobase.sqlite3') self.sqlite = sqlite3.connect(file, check_same_thread=False) self.lock = threading.Lock() sqlite_version = self._get_sqlite_version() self.logger.debug(f'SQLite version: {sqlite_version}') schema_version = self.schema_get_version() self.logger.debug(f'Schema version: {schema_version}') self.schema_init(schema_version) self.schema_set_version(self.SCHEMA) def __del__(self): if self.sqlite: self.sqlite.commit() self.sqlite.close() def _get_sqlite_version(self) -> str: cursor = self.sqlite.cursor() cursor.execute("SELECT sqlite_version()") return cursor.fetchone()[0] def schema_get_version(self) -> int: cursor = self.sqlite.execute('PRAGMA user_version') return int(cursor.fetchone()[0]) def schema_set_version(self, v) -> None: self.sqlite.execute('PRAGMA user_version={:d}'.format(v)) self.logger.info(f'Schema set to {v}') def cursor(self) -> sqlite3.Cursor: return self.sqlite.cursor() def commit(self) -> None: return self.sqlite.commit() def schema_init(self, version: int) -> None: cursor = self.cursor() if version < 1: # timestamps cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_links ( issue_date TEXT PRIMARY KEY, url TEXT NOT NULL, pages INTEGER NOT NULL )""") if version < 2: cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_pages ( collection_id INTEGER NOT NULL, doc_id INTEGER NOT NULL, page INTEGER NOT NULL, height INTEGER NOT NULL, width INTEGER NOT NULL, dpi INTEGER NOT NULL )""") cursor.execute("""CREATE UNIQUE INDEX mdf_pages_idx ON mdf_pages (collection_id, doc_id, page)""") if version < 3: cursor.execute("ALTER TABLE mdf_pages ADD fail INTEGER NOT NULL") if version < 4: cursor.execute("""CREATE INDEX mdf_pages_fail_idx ON mdf_pages (fail)""") if version < 5: for col in ('collection_id', 'doc_id'): cursor.execute(f"ALTER TABLE mdf_links ADD {col} INTEGER NOT NULL DEFAULT '0'") cursor.execute("CREATE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)") if version < 6: cursor.execute("DROP INDEX mdf_links_col_doc_idx") cursor.execute("CREATE UNIQUE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)") self.commit() def add_link(self, issue_date: str, url: str, pages: int): with self.lock: self.cursor().execute("REPLACE INTO mdf_links (issue_date, url, pages) VALUES (?, ?, ?)", (issue_date, url, str(pages))) self.commit() def add_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int): with self.lock: self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, ?, ?, ?, 0)", (collection_id, doc_id, page, width, height, dpi)) self.commit() def update_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int): with self.lock: self.cursor().execute("UPDATE mdf_pages SET width=?, height=?, dpi=?, fail=0 WHERE collection_id=? AND doc_id=? AND page=?", (width, height, dpi, collection_id, doc_id, page)) self.commit() def add_page_failed(self, collection_id, doc_id, page): with self.lock: self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, 0, 0, 0, 1)", (collection_id, doc_id, page)) self.commit() def get_existing_pages(self, fail=0): cur = self.cursor() cur.execute("SELECT collection_id, doc_id, page FROM mdf_pages WHERE fail=?", (fail,)) return cur.fetchall() def get_documents(self, range: Optional[tuple[str, str]] = None): cur = self.cursor() docs = [] sql = "SELECT issue_date, url, pages FROM mdf_links" if range: sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'" cur.execute(sql) for issue_date, url, pages in cur.fetchall(): pub_date, collection_id, doc_id = retronews.parse_url(url) docs.append(dict( collection_id=collection_id, doc_id=doc_id, pages=pages )) return docs def get_doc_pages(self, collection_id, doc_id): cur = self.cursor() cur.execute("SELECT page, width, height, dpi FROM mdf_pages WHERE collection_id=? AND doc_id=?", (collection_id, doc_id)) return cur.fetchall() def fix_documents(self): cur = self.cursor() cur.execute("SELECT issue_date, url FROM mdf_links") for issue_date, url in cur.fetchall(): pub_date, cid, did = retronews.parse_url(url) cur.execute("UPDATE mdf_links SET collection_id=?, doc_id=? WHERE issue_date=?", (cid, did, issue_date)) self.commit()