-rw-r--r--  .gitignore |  1
-rw-r--r--  README.txt |  3
-rwxr-xr-x  main.py    | 65
3 files changed, 69 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a09c56d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..15f0295
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,3 @@
+How to use:
+
+ ./main.py --collection-url https://www.dvidshub.net/publication/1340/national-cryptologic-museum-library --output-directory /where-to-save
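Note (a usage hint, not part of the commit itself): main.py imports the third-party packages requests and beautifulsoup4, so both have to be installed before the command above will run, for example:

    pip install requests beautifulsoup4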
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..0921d76
--- /dev/null
+++ b/main.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import requests
+import os
+
+from bs4 import BeautifulSoup
+from argparse import ArgumentParser
+
+
+class DVIDSParser:
+    def __init__(self, url, output):
+        self._url = url
+        self._output = output
+
+    def go(self):
+        years = self.get_years()
+        for y in years:
+            pages = self.get_pages_count(y)
+            for page in range(pages):
+                url = f'{self._url}?issueYearDropDown={y}&sortOptions=DESC&page={page+1}'
+                r = requests.get(url)
+                html = BeautifulSoup(r.text, 'html.parser')
+
+                for pub in html.select('.publication-item'):
+                    title = pub.select_one('div:not(.uk-row) > p:not(.issue-downloads):not(.issue-hits)').text.strip()
+                    pdf_id = os.path.basename(pub.select_one('a')['href'])
+                    pdf_url = f'https://media-cdn.dvidshub.net/pubs/pdf_{pdf_id}.pdf'
+                    filename = f'{title} ({pdf_id}).pdf'
+                    self.save(pdf_url, filename)
+                    print(f'[{y}/{page}/{pdf_id}] {title} OK')
+
+    def get_pages_count(self, year: int) -> int:
+        url = f'{self._url}/page/999/?issueYearDropDown={year}&sortOptions=DESC'  # deliberately out-of-range page number
+        r = requests.get(url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return int(html.select_one('li.uk-active > span').text)  # the active pager entry then shows the last page, i.e. the page count
+
+    def get_years(self) -> list[int]:
+        r = requests.get(self._url)
+        html = BeautifulSoup(r.text, 'html.parser')
+        return list(map(lambda opt: int(opt['value']), html.select_one('#issueYearDropDown').select('option')))
+
+    def save(self, url, filename):
+        streamed_response = requests.get(url, stream=True)  # stream the PDF to disk instead of loading it into memory
+        with open(os.path.join(self._output, filename), 'wb') as file:
+            for chunk in streamed_response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    file.write(chunk)
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('--collection-url', type=str, required=True)
+    # parser.add_argument('--pages', type=int, default=1)
+    parser.add_argument('--output-directory', type=str, required=True,
+                        help='Directory where files will be saved. Created if it does not exist.')
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_directory) and not os.path.isdir(args.output_directory):
+        raise OSError('specified path is not a directory')
+    elif not os.path.exists(args.output_directory):
+        os.makedirs(args.output_directory)
+
+    p = DVIDSParser(url=args.collection_url,
+                    output=args.output_directory)
+    p.go()
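For reference, a rough sketch (not part of the commit) of driving the same download programmatically instead of through the command line, by importing the class that main.py defines. It assumes main.py is importable from the current directory and that the output directory already exists, since DVIDSParser.save() does not create it; the output path is a placeholder:

    # sketch only: programmatic use of DVIDSParser from main.py
    from main import DVIDSParser

    scraper = DVIDSParser(
        url='https://www.dvidshub.net/publication/1340/national-cryptologic-museum-library',
        output='/tmp/ncml-pdfs',  # placeholder; must already exist
    )
    scraper.go()  # walks every year and page of the collection, saving each issue's PDF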