#!/usr/bin/env python3
"""Multi-threaded scraper: fetches book pages through SOCKS5 proxies and stores
the parsed metadata in a local database via the project's `fb.Database`."""
import requests
import threading
import random
import urllib3
from queue import Queue, Empty
from bs4 import BeautifulSoup
from fb import Database
from fb.util import get_fb_url, get_useragent
from argparse import ArgumentParser, ArgumentError
from sqlite3 import IntegrityError

# Shared database handle; assigned once in the __main__ block below.
db: Database

# One SOCKS5 endpoint per line; deduplicated into a list below.
my_proxies = """
127.0.0.1:1077
127.0.0.1:1079
127.0.0.1:1074
127.0.0.1:1076
127.0.0.1:1071
127.0.0.1:1081
127.0.0.1:1069
"""
my_proxies = list(set(my_proxies.strip().split()))


class TooManyRequests(Exception):
    """Raised when the server answers HTTP 429 (rate limited)."""
    pass


def parse_book_page(book_id: int, proxy: str) -> bool:
    """Fetch one book page through *proxy*, parse it, and store it in ``db``.

    Returns True on success, False for any non-200 (and non-429) response.
    Raises TooManyRequests on HTTP 429 so the caller can requeue the id.
    """
    headers = {
        'User-Agent': get_useragent()
    }
    url = get_fb_url(book_id)
    proxy_url = f'socks5://{proxy}'
    # Explicit timeout: without it a dead proxy stalls the worker thread forever.
    resp = requests.get(url, headers=headers,
                        proxies=dict(http=proxy_url, https=proxy_url),
                        timeout=30)
    if resp.status_code != 200:
        if resp.status_code == 429:
            raise TooManyRequests()
        # print(f'{book_id} code {resp.status_code}')
        return False

    html = BeautifulSoup(resp.text, "html.parser")
    desc = html.select_one('meta[name="description"]')['content']

    # Extract "key: value" pairs from the meta description.
    metainfo = []
    try:
        if desc.startswith('; '):
            desc = desc[2:]
        for item in desc.split('; '):
            colon_pos = item.index(':')
            key = item[0:colon_pos]
            val = item[colon_pos + 2:]  # skip the ': ' separator
            metainfo.append([key, val])
    except ValueError:
        # No colon anywhere: keep the raw description string instead.
        metainfo.append(desc)

    # Name and author from the overview header.
    name = html.select_one('div.overview h1').text
    author = html.select_one('div.overview h2 i').text

    # Breadcrumb hierarchy; the first (root) crumb is intentionally dropped.
    bc = html.select('ul.breadcrumb li.breadcrumb-item')
    bc_hierarchy = [crumb.text for crumb in bc[1:]]

    # Book info table: first cell is the label, second cell the value.
    # NOTE: renamed loop variable (was `r`) so it no longer shadows the response.
    details = {}
    for row in html.select('table[width="400"] tr'):
        cols = row.select('td')
        details[cols[0].text] = cols[1].text

    db.add_book(book_id, name, author, metainfo, bc_hierarchy, details)
    return True


def worker(task_queue: Queue, print_lock: threading.Lock, proxy: str) -> None:
    """Drain *task_queue*, parsing each book id through *proxy*.

    Network/rate-limit failures requeue the id; the thread exits when the
    queue is empty.
    """
    while True:
        try:
            # get_nowait() instead of empty()+get(): the old check-then-act
            # pattern could block forever if another worker drained the
            # queue between the empty() check and the blocking get().
            book_id = task_queue.get_nowait()
        except Empty:
            return
        db_error = False
        result = None
        try:
            result = parse_book_page(book_id, proxy)
        except IntegrityError:
            # Row already present; report below instead of crashing.
            db_error = True
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ConnectTimeout,
                urllib3.exceptions.ProtocolError,
                TooManyRequests):
            # Transient network / rate-limit failure: requeue for a retry.
            task_queue.put(book_id)
            db.add_failed_book(book_id)
            print(f'{book_id}: failed due to network error, proxy = {proxy}')
            continue
        except requests.exceptions.ChunkedEncodingError:
            print(f'{book_id} causes weird error')
            continue
        if result is not False:
            with print_lock:
                print(f"{book_id} " + ("done" if not db_error else " raised db error"))
        task_queue.task_done()


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--book-id', type=int)
    parser.add_argument('--continue', action='store_true')
    parser.add_argument('--max-book-id', type=int, default=1500000)
    parser.add_argument('--find-gaps', action='store_true')
    args = parser.parse_args()

    db = Database()

    if args.find_gaps:
        # Re-scrape every id in [id_from, id_to] that is missing from the db.
        id_from = 100000
        id_to = 1400000
        ids_in_db = db.get_ids(id_from, id_to)
        task_queue = Queue()
        print_lock = threading.Lock()
        for i in range(id_from, id_to + 1):
            if i not in ids_in_db:
                task_queue.put(i)
        threads = []
        for proxy in my_proxies:
            for _ in range(4):  # 4 workers per proxy
                thread = threading.Thread(target=worker,
                                          args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()
    # getattr() because `continue` is a keyword; the old hasattr() guard was
    # always true for a store_true option and has been dropped.
    elif getattr(args, 'continue'):
        # Resume scraping from the highest known id (or an explicit --book-id).
        if args.book_id:
            last_book_id = args.book_id
        else:
            last_book_id = db.get_max_book_id()
            if last_book_id is None:
                last_book_id = 0
        task_queue = Queue()
        print_lock = threading.Lock()
        for task_number in range(last_book_id + 1, args.max_book_id):
            task_queue.put(task_number)
        threads = []
        for proxy in my_proxies:
            for _ in range(3):  # 3 workers per proxy
                thread = threading.Thread(target=worker,
                                          args=(task_queue, print_lock, proxy))
                thread.start()
                threads.append(thread)
        for thread in threads:
            thread.join()
    else:
        # Single-book mode: fetch exactly one id through a random proxy.
        if not args.book_id:
            raise ArgumentError(None, '--book-id is required')
        proxy = random.choice(my_proxies)
        book = db.get_book(args.book_id)
        if book:
            raise RuntimeError('this book is already in the database')
        parse_book_page(args.book_id, proxy)