From 5fd7512f903522a47c416ebcda3b6acc6b080e49 Mon Sep 17 00:00:00 2001
From: Evgeny Zinoviev
Date: Sun, 16 Jun 2024 00:04:44 +0300
Subject: initial

---
 main.py | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100755 main.py

diff --git a/main.py b/main.py
new file mode 100755
index 0000000..4057156
--- /dev/null
+++ b/main.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+import requests
+import threading
+import random
+import urllib3
+
+from queue import Queue
+from bs4 import BeautifulSoup
+from fb import Database
+from fb.util import get_fb_url, get_useragent
+from argparse import ArgumentParser, ArgumentError
+from sqlite3 import IntegrityError
+
+db: Database
+
+my_proxies = """
+127.0.0.1:1077
+127.0.0.1:1079
+127.0.0.1:1074
+127.0.0.1:1076
+127.0.0.1:1071
+127.0.0.1:1081
+127.0.0.1:1069
+"""
+my_proxies = list(set(my_proxies.strip().split()))
+
+
+class TooManyRequests(Exception):
+    pass
+
+
+def parse_book_page(book_id: int, proxy: str):
+    headers = {
+        'User-Agent': get_useragent()
+    }
+
+    url = get_fb_url(book_id)
+    proxy = f'socks5://{proxy}'
+    r = requests.get(url,
+                     headers=headers,
+                     proxies=dict(http=proxy, https=proxy))
+    if r.status_code != 200:
+        if r.status_code == 429:
+            raise TooManyRequests()
+
+        # print(f'{book_id} code {r.status_code}')
+        return False
+
+    html = BeautifulSoup(r.text, "html.parser")
+    desc = html.select_one('meta[name="description"]')['content']
+
+    # extract useful info from meta tags
+    metainfo = []
+    try:
+        if desc.startswith('; '):
+            desc = desc[2:]
+        for item in desc.split('; '):
+            colon_pos = item.index(':')
+            key = item[0:colon_pos]
+            val = item[colon_pos+2:]
+            metainfo.append([key, val])
+    except ValueError:
+        # description is not a list of 'key: value' pairs; store it as-is
+        metainfo.append(desc)
+
+    # parse name and author
+    name = html.select_one('div.overview h1').text
+    author = html.select_one('div.overview h2 i').text
+
+    # parse breadcrumbs hierarchy, skipping the first (root) item
+    bc = html.select('ul.breadcrumb li.breadcrumb-item')
+    bc_hierarchy = []
+    bc_first_skipped = False
+    if bc:
+        for bc_item in bc:
+            if not bc_first_skipped:
+                bc_first_skipped = True
+                continue
+            bc_hierarchy.append(bc_item.text)
+
+    # book info table
+    details = {}
+    rows = html.select('table[width="400"] tr')
+    if rows:
+        for row in rows:
+            cols = row.select('td')
+            details[cols[0].text] = cols[1].text
+
+    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
+    return True
+
+
+def worker(task_queue, print_lock, proxy):
+    while not task_queue.empty():
+        book_id = task_queue.get()
+        db_error = False
+        result = None
+
+        try:
+            result = parse_book_page(book_id, proxy)
+        except IntegrityError:
+            db_error = True
+        except (requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, urllib3.exceptions.ProtocolError, TooManyRequests):
+            task_queue.put(book_id)
+            db.add_failed_book(book_id)
+            print(f'{book_id}: failed due to network error, proxy = {proxy}')
+            continue
+        except requests.exceptions.ChunkedEncodingError:
+            print(f'{book_id} causes weird error')
+            continue
+
+        if result is not False:
+            with print_lock:
+                print(f"{book_id} " + ("done" if not db_error else "raised db error"))
+        task_queue.task_done()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--book-id', type=int)
+    parser.add_argument('--continue', action='store_true')
+    parser.add_argument('--max-book-id', type=int, default=1500000)
+    parser.add_argument('--find-gaps', action='store_true')
+    args = parser.parse_args()
+
+    db = Database()
+
+    if args.find_gaps:
+        id_from = 100000
+        id_to = 1400000
+        ids_in_db = db.get_ids(id_from, id_to)
+        task_queue = Queue()
+        print_lock = threading.Lock()
+
+        for i in range(id_from, id_to+1):
+            if i not in ids_in_db:
+                task_queue.put(i)
+
+        threads = []
+        for proxy in my_proxies:
+            for i in range(4):
+                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+                thread.start()
+                threads.append(thread)
+
+        for thread in threads:
+            thread.join()
+
+    elif hasattr(args, 'continue') and getattr(args, 'continue') is True:
+        if args.book_id:
+            last_book_id = args.book_id
+        else:
+            last_book_id = db.get_max_book_id()
+            if last_book_id is None:
+                last_book_id = 0
+
+        task_queue = Queue()
+        print_lock = threading.Lock()
+
+        for task_number in range(last_book_id + 1, args.max_book_id):
+            task_queue.put(task_number)
+
+        threads = []
+        for proxy in my_proxies:
+            for i in range(3):
+                thread = threading.Thread(target=worker, args=(task_queue, print_lock, proxy))
+                thread.start()
+                threads.append(thread)
+
+        for thread in threads:
+            thread.join()
+    else:
+        if not args.book_id:
+            raise ArgumentError(None, '--book-id is required')
+        proxy = random.choice(my_proxies)
+        book = db.get_book(args.book_id)
+        if book:
+            raise RuntimeError('this book is already in the database')
+        parse_book_page(args.book_id, proxy)
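
Note: the fb package imported above (Database, get_fb_url, get_useragent) is not part of this commit. For readers, below is a minimal sketch of the interface main.py expects, with fb/__init__.py and fb/util.py collapsed into one listing. Only the method names, signatures and return conventions are taken from the calls in main.py; the database file name, schema, JSON encoding of the complex columns and the URL pattern are assumptions.

# Hypothetical sketch of the fb package, not the real implementation.
import json
import sqlite3
import threading


def get_fb_url(book_id: int) -> str:
    # assumption: the scraped site's book page URL pattern
    return f'https://example.com/books/{book_id}'


def get_useragent() -> str:
    # assumption: a fixed UA string; the real helper probably rotates these
    return 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'


class Database:
    def __init__(self, path: str = 'fb.db'):
        # one shared connection; workers run in threads, so access is
        # serialized with a lock and check_same_thread is disabled
        self.conn = sqlite3.connect(path, check_same_thread=False)
        self.lock = threading.Lock()
        with self.lock, self.conn:
            self.conn.execute('CREATE TABLE IF NOT EXISTS books ('
                              'id INTEGER PRIMARY KEY, name TEXT, author TEXT, '
                              'metainfo TEXT, hierarchy TEXT, details TEXT)')
            self.conn.execute('CREATE TABLE IF NOT EXISTS failed_books ('
                              'id INTEGER PRIMARY KEY)')

    def add_book(self, book_id, name, author, metainfo, hierarchy, details):
        # a duplicate id raises sqlite3.IntegrityError, which worker() catches
        with self.lock, self.conn:
            self.conn.execute(
                'INSERT INTO books VALUES (?, ?, ?, ?, ?, ?)',
                (book_id, name, author, json.dumps(metainfo),
                 json.dumps(hierarchy), json.dumps(details)))

    def add_failed_book(self, book_id):
        with self.lock, self.conn:
            self.conn.execute('INSERT OR IGNORE INTO failed_books VALUES (?)',
                              (book_id,))

    def get_book(self, book_id):
        with self.lock:
            return self.conn.execute('SELECT * FROM books WHERE id = ?',
                                     (book_id,)).fetchone()

    def get_max_book_id(self):
        with self.lock:
            # MAX(id) is NULL on an empty table, so this yields None there,
            # matching the `if last_book_id is None` check in main.py
            return self.conn.execute('SELECT MAX(id) FROM books').fetchone()[0]

    def get_ids(self, id_from, id_to):
        # return a set so the `i not in ids_in_db` membership test in the
        # --find-gaps loop is O(1) across the 1.3M-id range
        with self.lock:
            rows = self.conn.execute(
                'SELECT id FROM books WHERE id BETWEEN ? AND ?',
                (id_from, id_to)).fetchall()
            return {row[0] for row in rows}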