From 1b8b56061ba92d7ab27cbd33ce15bcf9591b25ea Mon Sep 17 00:00:00 2001
From: Evgeny Sorokin
Date: Thu, 11 Apr 2024 00:16:03 +0300
Subject: initial

---
 .gitignore |  1 +
 README.txt |  3 +++
 main.py    | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.txt
 create mode 100755 main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a09c56d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..15f0295
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,3 @@
+How to use:
+
+    ./main.py --collection-url https://www.dvidshub.net/publication/1340/national-cryptologic-museum-library --output-directory /where-to-save
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..0921d76
--- /dev/null
+++ b/main.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import requests
+import os
+
+from bs4 import BeautifulSoup
+from argparse import ArgumentParser
+
+
+class DVIDSParser:
+    def __init__(self, url, output):
+        self._url = url
+        self._output = output
+
+    def go(self):
+        years = self.get_years()
+        for y in years:
+            pages = self.get_pages_count(y)
+            for page in range(pages):
+                url = f'{self._url}?issueYearDropDown={y}&sortOptions=DESC&page={page+1}'
+                r = requests.get(url)
+                html = BeautifulSoup(r.text, 'html.parser')
+
+                for pub in html.select('.publication-item'):
+                    title = pub.select_one('div:not(.uk-row) > p:not(.issue-downloads):not(.issue-hits)').text.strip()
+                    pdf_id = os.path.basename(pub.select_one('a')['href'])
+                    pdf_url = f'https://media-cdn.dvidshub.net/pubs/pdf_{pdf_id}.pdf'
+                    filename = f'{title} ({pdf_id}).pdf'
+                    self.save(pdf_url, filename)
+                    print(f'[{y}/{page}/{pdf_id}] {title} OK')
+
+    def get_pages_count(self, year: int) -> int:
+        url = f'{self._url}/page/999/?issueYearDropDown={year}&sortOptions=DESC'
+        r = requests.get(url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return int(html.select_one('li.uk-active > span').text)
+
+    def get_years(self) -> list[int]:
+        r = requests.get(self._url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return list(map(lambda opt: int(opt['value']), html.select_one('#issueYearDropDown').select('option')))
+
+    def save(self, url, filename):
+        streamed_response = requests.get(url, stream=True)
+        with open(os.path.join(self._output, filename), 'wb') as file:
+            for chunk in streamed_response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    file.write(chunk)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--collection-url', type=str, required=True)
+    # parser.add_argument('--pages', type=int, default=1)
+    parser.add_argument('--output-directory', type=str, required=True,
+                        help='Directory where to save files. If does not exist, it will be created')
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_directory) and not os.path.isdir(args.output_directory):
+        raise OSError('specified path is not a directory')
+    elif not os.path.exists(args.output_directory):
+        os.makedirs(args.output_directory)
+
+    p = DVIDSParser(url=args.collection_url,
+                    output=args.output_directory)
+    p.go()
--
cgit v1.2.3