author     Evgeny Zinoviev <me@ch1p.io>  2024-06-16 00:04:44 +0300
committer  Evgeny Zinoviev <me@ch1p.io>  2024-06-16 00:31:39 +0300
commit     5fd7512f903522a47c416ebcda3b6acc6b080e49 (patch)
tree       d758fb0d5432e09edb44a1e39b92fd6724e7c4d4 /main.py
initial (HEAD, master)
Diffstat (limited to 'main.py')
 -rwxr-xr-x  main.py  179
 1 file changed, 179 insertions(+), 0 deletions(-)
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..4057156
--- /dev/null
+++ b/main.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+import requests
+import threading
+import random
+import urllib3
+
+from queue import Queue, Empty
+from bs4 import BeautifulSoup
+from fb import Database
+from fb.util import get_fb_url, get_useragent
+from argparse import ArgumentParser, ArgumentError
+from sqlite3 import IntegrityError
+
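+# assigned in __main__ before any worker thread runs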
+db: Database
+
+my_proxies = """
+127.0.0.1:1077
+127.0.0.1:1079
+127.0.0.1:1074
+127.0.0.1:1076
+127.0.0.1:1071
+127.0.0.1:1081
+127.0.0.1:1069
+"""
+# deduplicate the proxy list; order is irrelevant
+my_proxies = list(set(my_proxies.strip().split()))
+
+
+class TooManyRequests(Exception):
+ pass
+
+
+def parse_book_page(book_id: int, proxy: str):
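+    """Fetch the page for book_id through the given SOCKS5 proxy and
+    store the parsed metadata in the database.
+
+    Returns True on success, False on a non-200 response; raises
+    TooManyRequests on HTTP 429.
+    """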
+ headers = {
+ 'User-Agent': get_useragent()
+ }
+
+ url = get_fb_url(book_id)
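+    # socks5:// proxies require PySocks (pip install requests[socks])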
+ proxy = f'socks5://{proxy}'
+ r = requests.get(url,
+ headers=headers,
+ proxies=dict(http=proxy, https=proxy))
+ if r.status_code != 200:
+ if r.status_code == 429:
+ raise TooManyRequests()
+
+ # print(f'{book_id} code {r.status_code}')
+ return False
+
+ html = BeautifulSoup(r.text, "html.parser")
+ desc = html.select_one('meta[name="description"]')['content']
+
+ # extract useful info from meta tags
+ metainfo = []
+    try:
+        if desc.startswith('; '):
+            desc = desc[2:]
+        for item in desc.split('; '):
+            colon_pos = item.index(':')
+            key = item[0:colon_pos]
+            val = item[colon_pos + 2:]  # skip the ': ' separator
+            metainfo.append([key, val])
+    except ValueError:
+        # description has no 'key: value' structure; keep the raw string
+        metainfo.append(desc)
+
+ # parse name and author
+ name = html.select_one('div.overview h1').text
+ author = html.select_one('div.overview h2 i').text
+
+ # parse breadcrumbs hierarchy
+    bc = html.select('ul.breadcrumb li.breadcrumb-item')
+    bc_hierarchy = []
+    for bc_item in bc[1:]:  # skip the first breadcrumb item
+        bc_hierarchy.append(bc_item.text)
+
+ # book info table
+ details = {}
+    rows = html.select('table[width="400"] tr')
+    for row in rows:  # 'row', not 'r', to avoid shadowing the response above
+        cols = row.select('td')
+        details[cols[0].text] = cols[1].text
+
+ db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
+ return True
+
+
+def worker(task_queue, print_lock, proxy):
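+    """Pull book ids off the shared queue until it is empty.
+
+    Ids that fail with a network error are re-queued for another attempt.
+    """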
+    while True:
+        try:
+            # get_nowait() avoids the race between empty() and get()
+            book_id = task_queue.get_nowait()
+        except Empty:
+            break
+        db_error = False
+        result = None
+
+ try:
+ result = parse_book_page(book_id, proxy)
+ except IntegrityError:
+ db_error = True
+        except (requests.exceptions.ConnectionError,
+                requests.exceptions.ConnectTimeout,
+                urllib3.exceptions.ProtocolError,
+                TooManyRequests):
+ task_queue.put(book_id)
+ db.add_failed_book(book_id)
+ print(f'{book_id}: failed due to network error, proxy = {proxy}')
+ continue
+ except requests.exceptions.ChunkedEncodingError:
+ print(f'{book_id} causes weird error')
+ continue
+
+        if result is not False:
+            with print_lock:
+                print(f"{book_id} " + ("done" if not db_error else "raised db error"))
+ task_queue.task_done()
+
+
+if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--book-id', type=int)
+ parser.add_argument('--continue', action='store_true')
+ parser.add_argument('--max-book-id', type=int, default=1500000)
+ parser.add_argument('--find-gaps', action='store_true')
+ args = parser.parse_args()
+
+ db = Database()
+
+ if args.find_gaps:
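+        # enqueue every id in [id_from, id_to] that is not yet in the db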
+ id_from = 100000
+ id_to = 1400000
+ ids_in_db = db.get_ids(id_from, id_to)
+ task_queue = Queue()
+ print_lock = threading.Lock()
+
+ for i in range(id_from, id_to+1):
+ if i not in ids_in_db:
+ task_queue.put(i)
+
+ threads = []
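+        # four worker threads per proxy endpoint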
+ for proxy in my_proxies:
+ for i in range(4):
+ thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+ thread.start()
+ threads.append(thread)
+
+ for thread in threads:
+ thread.join()
+
+    # 'continue' is a Python keyword, hence getattr() instead of args.continue
+    elif getattr(args, 'continue'):
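+        # resume from the highest id already in the database,
+        # or from --book-id when it is given explicitly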
+ if args.book_id:
+ last_book_id = args.book_id
+ else:
+ last_book_id = db.get_max_book_id()
+ if last_book_id is None:
+ last_book_id = 0
+
+ task_queue = Queue()
+ print_lock = threading.Lock()
+
+ for task_number in range(last_book_id + 1, args.max_book_id):
+ task_queue.put(task_number)
+
+ threads = []
+ for proxy in my_proxies:
+ for i in range(3):
+ thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+ thread.start()
+ threads.append(thread)
+
+ for thread in threads:
+ thread.join()
+ else:
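+        # single-book mode: fetch one page through a randomly chosen proxy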
+ if not args.book_id:
+ raise ArgumentError(None, '--book-id is required')
+ proxy = random.choice(my_proxies)
+ book = db.get_book(args.book_id)
+ if book:
+ raise RuntimeError('this book is already in the database')
+ parse_book_page(args.book_id, proxy)