summary refs log tree commit diff
path: root/main.py
diff options
context:
space:
mode:
author Evgeny Sorokin <me@ch1p.io> 2024-04-11 00:16:03 +0300
committer Evgeny Sorokin <me@ch1p.io> 2024-04-11 00:16:03 +0300
commit 1b8b56061ba92d7ab27cbd33ce15bcf9591b25ea (patch)
tree f58127d12335ae8d9b2cfc10a9538aeff754e72e /main.py
initial HEAD master
Diffstat (limited to 'main.py')
-rwxr-xr-x main.py 65
1 files changed, 65 insertions, 0 deletions
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..0921d76
--- /dev/null
+++ b/main.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import requests
+import os
+
+from bs4 import BeautifulSoup
+from argparse import ArgumentParser
+
+
class DVIDSParser:
    """Scrape a publication collection page and download every issue as a PDF.

    The collection is walked year by year and listing page by listing page;
    every ``.publication-item`` element found is saved into the output
    directory as ``<title> (<pdf id>).pdf``.
    """

    # Seconds to wait for the server before giving up. ``requests`` applies no
    # timeout unless one is passed, so a stalled connection would otherwise
    # block the whole run indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, output):
        """Store the scraper configuration.

        :param url: URL of the collection page to scrape.
        :param output: Directory the PDFs are written to. It must already
            exist; ``save`` opens files inside it but does not create it.
        """
        self._url = url
        self._output = output

    def go(self):
        """Download every publication of every year listed on the collection page.

        Prints one progress line per saved file. Network errors and HTTP
        error statuses on a PDF download propagate as ``requests`` exceptions.
        """
        for y in self.get_years():
            pages = self.get_pages_count(y)
            # Listing pages are requested as 1..pages, and the same number is
            # used in the progress line so the log matches the URL fetched.
            for page in range(1, pages + 1):
                url = f'{self._url}?issueYearDropDown={y}&sortOptions=DESC&page={page}'
                html = self._fetch_html(url)

                for pub in html.select('.publication-item'):
                    title = pub.select_one('div:not(.uk-row) > p:not(.issue-downloads):not(.issue-hits)').text.strip()
                    # The last path component of the item's first link is used as the PDF id.
                    pdf_id = os.path.basename(pub.select_one('a')['href'])
                    pdf_url = f'https://media-cdn.dvidshub.net/pubs/pdf_{pdf_id}.pdf'
                    # The title comes straight from remote HTML: neutralise
                    # path separators so it cannot escape the output directory
                    # or make open() fail on a missing sub-directory.
                    filename = self._safe_filename(f'{title} ({pdf_id}).pdf')
                    self.save(pdf_url, filename)
                    print(f'[{y}/{page}/{pdf_id}] {title} OK')

    def get_pages_count(self, year: int) -> int:
        """Return the number of listing pages available for ``year``.

        Requests page 999 of the listing and reads the number shown in the
        active pagination element (``li.uk-active > span``).
        NOTE(review): this presumably relies on the site clamping an
        out-of-range page to the last one - confirm. If the element is
        absent, ``AttributeError`` is raised.
        """
        url = f'{self._url}/page/999/?issueYearDropDown={year}&sortOptions=DESC'
        html = self._fetch_html(url)
        return int(html.select_one('li.uk-active > span').text)

    def get_years(self) -> list[int]:
        """Return the years offered by the ``#issueYearDropDown`` selector."""
        html = self._fetch_html(self._url)
        options = html.select_one('#issueYearDropDown').select('option')
        return [int(opt['value']) for opt in options]

    def save(self, url, filename):
        """Stream ``url`` into ``filename`` inside the output directory.

        :param url: Address of the file to download.
        :param filename: Name of the file to create, relative to the output
            directory.
        :raises requests.HTTPError: if the server answers with an error
            status; nothing is written in that case, so an error page is
            never stored under a ``.pdf`` name.
        """
        path = os.path.join(self._output, filename)
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            with open(path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        file.write(chunk)

    def _fetch_html(self, url):
        """Fetch ``url`` and return its body parsed with ``html.parser``.

        The HTTP status is intentionally not checked here: the page-count
        probe asks for an out-of-range page, and which status the server
        returns for that request is not known from this file.
        """
        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(r.text, 'html.parser')

    @staticmethod
    def _safe_filename(name: str) -> str:
        """Return ``name`` with path separators and control characters replaced by ``_``."""
        return ''.join('_' if ch in '/\\' or ord(ch) < 32 else ch for ch in name)
+
+
if __name__ == '__main__':
    # Command-line entry point: read the options, make sure the output
    # directory is usable, then run the scraper.
    parser = ArgumentParser()
    parser.add_argument('--collection-url', type=str, required=True)
    # parser.add_argument('--pages', type=int, default=1)
    parser.add_argument('--output-directory', type=str, required=True,
                        help='Directory where to save files. If does not exist, it will be created')
    args = parser.parse_args()

    target_dir = args.output_directory
    if not os.path.exists(target_dir):
        # Nothing at that path yet: create the directory (and any parents).
        os.makedirs(target_dir)
    elif not os.path.isdir(target_dir):
        # Something exists there but it is not a directory.
        raise OSError('specified path is not a directory')

    DVIDSParser(url=args.collection_url, output=target_dir).go()