author | Evgeny Zinoviev <me@ch1p.io> | 2024-06-16 00:04:44 +0300 |
---|---|---|
committer | Evgeny Zinoviev <me@ch1p.io> | 2024-06-16 00:31:39 +0300 |
commit | 5fd7512f903522a47c416ebcda3b6acc6b080e49 (patch) | |
tree | d758fb0d5432e09edb44a1e39b92fd6724e7c4d4 | /main.py |
Diffstat (limited to 'main.py')
-rwxr-xr-x | main.py | 179 |
1 files changed, 179 insertions, 0 deletions
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+import requests
+import threading
+import random
+import urllib3
+
+from queue import Queue, Empty
+from bs4 import BeautifulSoup
+from fb import Database
+from fb.util import get_fb_url, get_useragent
+from argparse import ArgumentParser, ArgumentError
+from sqlite3 import IntegrityError
+
+# global database handle, initialized in __main__
+db: Database
+
+# local SOCKS5 endpoints the requests are spread over
+my_proxies = """
+127.0.0.1:1077
+127.0.0.1:1079
+127.0.0.1:1074
+127.0.0.1:1076
+127.0.0.1:1071
+127.0.0.1:1081
+127.0.0.1:1069
+"""
+my_proxies = list(set(my_proxies.strip().split()))
+
+
+class TooManyRequests(Exception):
+    pass
+
+
+def parse_book_page(book_id: int, proxy: str):
+    headers = {
+        'User-Agent': get_useragent()
+    }
+
+    url = get_fb_url(book_id)
+    proxy = f'socks5://{proxy}'
+    r = requests.get(url,
+                     headers=headers,
+                     proxies=dict(http=proxy, https=proxy))
+    if r.status_code != 200:
+        if r.status_code == 429:
+            raise TooManyRequests()
+
+        # print(f'{book_id} code {r.status_code}')
+        return False
+
+    html = BeautifulSoup(r.text, "html.parser")
+    desc = html.select_one('meta[name="description"]')['content']
+
+    # extract useful info from meta tags
+    metainfo = []
+    try:
+        if desc.startswith('; '):
+            desc = desc[2:]
+        for item in desc.split('; '):
+            colon_pos = item.index(':')
+            key = item[0:colon_pos]
+            val = item[colon_pos+2:]
+            metainfo.append([key, val])
+    except ValueError:
+        # no 'key: value' structure; keep the raw description
+        metainfo.append(desc)
+
+    # parse name and author
+    name = html.select_one('div.overview h1').text
+    author = html.select_one('div.overview h2 i').text
+
+    # parse breadcrumbs hierarchy, skipping the first (root) item
+    bc = html.select('ul.breadcrumb li.breadcrumb-item')
+    bc_hierarchy = []
+    bc_first_skipped = False
+    if bc:
+        for bc_item in bc:
+            if not bc_first_skipped:
+                bc_first_skipped = True
+                continue
+            bc_hierarchy.append(bc_item.text)
+
+    # book info table
+    details = {}
+    rows = html.select('table[width="400"] tr')
+    if rows:
+        for row in rows:
+            cols = row.select('td')
+            details[cols[0].text] = cols[1].text
+
+    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
+    return True
+
+
+def worker(task_queue, print_lock, proxy):
+    while True:
+        # get_nowait() avoids the race between a separate empty() check and a
+        # blocking get() when several workers drain the same queue
+        try:
+            book_id = task_queue.get_nowait()
+        except Empty:
+            break
+
+        db_error = False
+        result = None
+
+        try:
+            result = parse_book_page(book_id, proxy)
+        except IntegrityError:
+            db_error = True
+        except (requests.exceptions.ConnectionError,
+                requests.exceptions.ConnectTimeout,
+                urllib3.exceptions.ProtocolError,
+                TooManyRequests):
+            # transient network failure: record it and requeue for a retry
+            task_queue.put(book_id)
+            db.add_failed_book(book_id)
+            print(f'{book_id}: failed due to network error, proxy = {proxy}')
+            continue
+        except requests.exceptions.ChunkedEncodingError:
+            print(f'{book_id} causes weird error')
+            continue
+
+        if result is not False:
+            with print_lock:
+                print(f"{book_id} " + ("done" if not db_error else "raised db error"))
+        task_queue.task_done()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--book-id', type=int)
+    parser.add_argument('--continue', action='store_true')
+    parser.add_argument('--max-book-id', type=int, default=1500000)
+    parser.add_argument('--find-gaps', action='store_true')
+    args = parser.parse_args()
+
+    db = Database()
+
+    if args.find_gaps:
+        # re-crawl ids in this range that are missing from the database
+        id_from = 100000
+        id_to = 1400000
+        ids_in_db = db.get_ids(id_from, id_to)
+        task_queue = Queue()
+        print_lock = threading.Lock()
+
+        for i in range(id_from, id_to+1):
+            if i not in ids_in_db:
+                task_queue.put(i)
+
+        threads = []
+        for proxy in my_proxies:
+            for i in range(4):
+                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+                thread.start()
+                threads.append(thread)
+
+        for thread in threads:
+            thread.join()
+
+    # 'continue' is a keyword, so the attribute is reached via getattr()
+    elif getattr(args, 'continue'):
+        if args.book_id:
+            last_book_id = args.book_id
+        else:
+            last_book_id = db.get_max_book_id()
+            if last_book_id is None:
+                last_book_id = 0
+
+        task_queue = Queue()
+        print_lock = threading.Lock()
+
+        for task_number in range(last_book_id + 1, args.max_book_id):
+            task_queue.put(task_number)
+
+        threads = []
+        for proxy in my_proxies:
+            for i in range(3):
+                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+                thread.start()
+                threads.append(thread)
+
+        for thread in threads:
+            thread.join()
+    else:
+        if not args.book_id:
+            raise ArgumentError(None, '--book-id is required')
+        proxy = random.choice(my_proxies)
+        book = db.get_book(args.book_id)
+        if book:
+            raise RuntimeError('this book is already in the database')
+        parse_book_page(args.book_id, proxy)
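The `fb` package itself is not touched by this commit. For orientation, here is a minimal sketch of the interface `main.py` relies on; every signature below is inferred from the call sites above, and the URL template and user agent string are hypothetical placeholders, not the real values from `fb/util.py`.

```python
# NOT part of this commit: an assumed sketch of the fb package surface,
# reconstructed from how main.py calls it.
from typing import Optional


def get_fb_url(book_id: int) -> str:
    # hypothetical URL template; the real one lives in fb/util.py
    return f'https://books.example/b/{book_id}'


def get_useragent() -> str:
    # hypothetical; presumably rotates realistic browser strings
    return 'Mozilla/5.0 (X11; Linux x86_64)'


class Database:
    def add_book(self, book_id: int, name: str, author: str,
                 metainfo: list, bc_hierarchy: list, details: dict) -> None: ...
    def add_failed_book(self, book_id: int) -> None: ...
    def get_book(self, book_id: int): ...
    def get_ids(self, id_from: int, id_to: int) -> set: ...
    def get_max_book_id(self) -> Optional[int]: ...
```

Given that interface, the script runs in one of three modes: `--find-gaps` refills ids missing between 100000 and 1400000, `--continue` resumes from the highest stored id (or from `--book-id` if given) up to `--max-book-id`, and a bare `--book-id` fetches a single page through a randomly chosen proxy.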