diff options
Diffstat (limited to 'database')
-rw-r--r-- | database/__init__.py | 1 | ||||
-rw-r--r-- | database/database.py | 152 |
2 files changed, 153 insertions, 0 deletions
diff --git a/database/__init__.py b/database/__init__.py new file mode 100644 index 0000000..ef3f969 --- /dev/null +++ b/database/__init__.py @@ -0,0 +1 @@ +from .database import Database diff --git a/database/database.py b/database/database.py new file mode 100644 index 0000000..04902f1 --- /dev/null +++ b/database/database.py @@ -0,0 +1,152 @@ +import sqlite3 +import logging +import os.path +import retronews +import threading + +from typing import Optional + + +class Database: + SCHEMA = 6 + + def __init__(self): + self.logger = logging.getLogger(self.__class__.__name__) + + file = os.path.join(os.path.dirname(__file__), '..', 'mdf-retrobase.sqlite3') + self.sqlite = sqlite3.connect(file, check_same_thread=False) + self.lock = threading.Lock() + + sqlite_version = self._get_sqlite_version() + self.logger.debug(f'SQLite version: {sqlite_version}') + + schema_version = self.schema_get_version() + self.logger.debug(f'Schema version: {schema_version}') + + self.schema_init(schema_version) + self.schema_set_version(self.SCHEMA) + + def __del__(self): + if self.sqlite: + self.sqlite.commit() + self.sqlite.close() + + def _get_sqlite_version(self) -> str: + cursor = self.sqlite.cursor() + cursor.execute("SELECT sqlite_version()") + return cursor.fetchone()[0] + + def schema_get_version(self) -> int: + cursor = self.sqlite.execute('PRAGMA user_version') + return int(cursor.fetchone()[0]) + + def schema_set_version(self, v) -> None: + self.sqlite.execute('PRAGMA user_version={:d}'.format(v)) + self.logger.info(f'Schema set to {v}') + + def cursor(self) -> sqlite3.Cursor: + return self.sqlite.cursor() + + def commit(self) -> None: + return self.sqlite.commit() + + def schema_init(self, version: int) -> None: + cursor = self.cursor() + + if version < 1: + # timestamps + cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_links ( + issue_date TEXT PRIMARY KEY, + url TEXT NOT NULL, + pages INTEGER NOT NULL + )""") + + if version < 2: + cursor.execute("""CREATE TABLE IF NOT EXISTS mdf_pages ( + collection_id INTEGER NOT NULL, + doc_id INTEGER NOT NULL, + page INTEGER NOT NULL, + height INTEGER NOT NULL, + width INTEGER NOT NULL, + dpi INTEGER NOT NULL + )""") + cursor.execute("""CREATE UNIQUE INDEX mdf_pages_idx ON mdf_pages (collection_id, doc_id, page)""") + + if version < 3: + cursor.execute("ALTER TABLE mdf_pages ADD fail INTEGER NOT NULL") + + if version < 4: + cursor.execute("""CREATE INDEX mdf_pages_fail_idx ON mdf_pages (fail)""") + + if version < 5: + for col in ('collection_id', 'doc_id'): + cursor.execute(f"ALTER TABLE mdf_links ADD {col} INTEGER NOT NULL DEFAULT '0'") + cursor.execute("CREATE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)") + + if version < 6: + cursor.execute("DROP INDEX mdf_links_col_doc_idx") + cursor.execute("CREATE UNIQUE INDEX mdf_links_col_doc_idx ON mdf_links (collection_id, doc_id)") + + self.commit() + + def add_link(self, issue_date: str, url: str, pages: int): + with self.lock: + self.cursor().execute("REPLACE INTO mdf_links (issue_date, url, pages) VALUES (?, ?, ?)", + (issue_date, url, str(pages))) + self.commit() + + def add_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int): + with self.lock: + self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, ?, ?, ?, 0)", + (collection_id, doc_id, page, width, height, dpi)) + self.commit() + + def update_page(self, collection_id: int, doc_id: int, page: int, width: int, height: int, dpi: int): + with self.lock: + self.cursor().execute("UPDATE mdf_pages SET width=?, height=?, dpi=?, fail=0 WHERE collection_id=? AND doc_id=? AND page=?", + (width, height, dpi, collection_id, doc_id, page)) + self.commit() + + def add_page_failed(self, collection_id, doc_id, page): + with self.lock: + self.cursor().execute("INSERT INTO mdf_pages (collection_id, doc_id, page, width, height, dpi, fail) VALUES (?, ?, ?, 0, 0, 0, 1)", + (collection_id, doc_id, page)) + self.commit() + + def get_existing_pages(self, fail=0): + cur = self.cursor() + cur.execute("SELECT collection_id, doc_id, page FROM mdf_pages WHERE fail=?", (fail,)) + return cur.fetchall() + + def get_documents(self, range: Optional[tuple[str, str]] = None): + cur = self.cursor() + docs = [] + + sql = "SELECT issue_date, url, pages FROM mdf_links" + if range: + sql += f" WHERE issue_date BETWEEN '{range[0]}' AND '{range[1]}'" + cur.execute(sql) + for issue_date, url, pages in cur.fetchall(): + pub_date, collection_id, doc_id = retronews.parse_url(url) + docs.append(dict( + collection_id=collection_id, + doc_id=doc_id, + pages=pages + )) + + return docs + + def get_doc_pages(self, collection_id, doc_id): + cur = self.cursor() + cur.execute("SELECT page, width, height, dpi FROM mdf_pages WHERE collection_id=? AND doc_id=?", + (collection_id, doc_id)) + return cur.fetchall() + + def fix_documents(self): + cur = self.cursor() + cur.execute("SELECT issue_date, url FROM mdf_links") + for issue_date, url in cur.fetchall(): + pub_date, cid, did = retronews.parse_url(url) + cur.execute("UPDATE mdf_links SET collection_id=?, doc_id=? WHERE issue_date=?", + (cid, did, issue_date)) + self.commit() |