author     Evgeny Sorokin <me@ch1p.io>    2024-04-11 00:16:03 +0300
committer  Evgeny Sorokin <me@ch1p.io>    2024-04-11 00:16:03 +0300
commit     1b8b56061ba92d7ab27cbd33ce15bcf9591b25ea (patch)
tree       f58127d12335ae8d9b2cfc10a9538aeff754e72e
-rw-r--r--  .gitignore |  1
-rw-r--r--  README.txt |  3
-rwxr-xr-x  main.py    | 65
3 files changed, 69 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a09c56d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..15f0295
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,3 @@
+How to use:
+
+    ./main.py --collection-url https://www.dvidshub.net/publication/1340/national-cryptologic-museum-library --output-directory /where-to-save
diff --git a/main.py b/main.py
new file mode 100755
--- /dev/null
+++ b/main.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import requests
+import os
+
+from bs4 import BeautifulSoup
+from argparse import ArgumentParser
+
+
+class DVIDSParser:
+    def __init__(self, url, output):
+        self._url = url
+        self._output = output
+
+    def go(self):
+        years = self.get_years()
+        for y in years:
+            pages = self.get_pages_count(y)
+            for page in range(pages):
+                url = f'{self._url}?issueYearDropDown={y}&sortOptions=DESC&page={page+1}'
+                r = requests.get(url)
+                html = BeautifulSoup(r.text, 'html.parser')
+
+                for pub in html.select('.publication-item'):
+                    title = pub.select_one('div:not(.uk-row) > p:not(.issue-downloads):not(.issue-hits)').text.strip()
+                    pdf_id = os.path.basename(pub.select_one('a')['href'])
+                    pdf_url = f'https://media-cdn.dvidshub.net/pubs/pdf_{pdf_id}.pdf'
+                    filename = f'{title} ({pdf_id}).pdf'
+                    self.save(pdf_url, filename)
+                    print(f'[{y}/{page}/{pdf_id}] {title} OK')
+
+    def get_pages_count(self, year: int) -> int:
+        url = f'{self._url}/page/999/?issueYearDropDown={year}&sortOptions=DESC'
+        r = requests.get(url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return int(html.select_one('li.uk-active > span').text)
+
+    def get_years(self) -> list[int]:
+        r = requests.get(self._url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return list(map(lambda opt: int(opt['value']), html.select_one('#issueYearDropDown').select('option')))
+
+    def save(self, url, filename):
+        streamed_response = requests.get(url, stream=True)
+        with open(os.path.join(self._output, filename), 'wb') as file:
+            for chunk in streamed_response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    file.write(chunk)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--collection-url', type=str, required=True)
+    # parser.add_argument('--pages', type=int, default=1)
+    parser.add_argument('--output-directory', type=str, required=True,
+                        help='Directory where to save files. If does not exist, it will be created')
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_directory) and not os.path.isdir(args.output_directory):
+        raise OSError('specified path is not a directory')
+    elif not os.path.exists(args.output_directory):
+        os.makedirs(args.output_directory)
+
+    p = DVIDSParser(url=args.collection_url,
+                    output=args.output_directory)
+    p.go()
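
For reference, a minimal sketch of driving the scraper from another script instead of the CLI invocation shown in README.txt. It assumes main.py from this commit is importable from the current directory; the /tmp/ncm-library output path is only a placeholder, and the collection URL is the one from README.txt.

    #!/usr/bin/env python3
    # Sketch: reuse DVIDSParser from main.py programmatically (assumed importable).
    import os

    from main import DVIDSParser

    output = '/tmp/ncm-library'  # placeholder path, not part of the repository
    os.makedirs(output, exist_ok=True)  # DVIDSParser.save() expects the directory to exist

    p = DVIDSParser(
        url='https://www.dvidshub.net/publication/1340/national-cryptologic-museum-library',
        output=output,
    )
    p.go()  # iterates every year and page of the collection and downloads each PDF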