summary refs log tree commit diff
path: root/search_2ch_archive.py
diff options
context:
space:
mode:
author rusinthread <rusinthread@cock.li> 2017-01-07 00:37:00 +0300
committer rusinthread <rusinthread@cock.li> 2017-01-07 00:37:00 +0300
commit 52602304e44be93124707f719015f0962f4ae1ad (patch)
tree ead58ca704400e9a1db81044a01d808c5b5301b1 /search_2ch_archive.py
parent 40fb54b80d7fa146683623dc0d7f17783f591017 (diff)
update readme, rename searching script
Diffstat (limited to 'search_2ch_archive.py')
-rwxr-xr-x search_2ch_archive.py 49
1 files changed, 49 insertions, 0 deletions
diff --git a/search_2ch_archive.py b/search_2ch_archive.py
new file mode 100755
index 0000000..2ddb211
--- /dev/null
+++ b/search_2ch_archive.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+import requests, re
+#import sys
+
+# Keywords looked for in archive link titles. Kept at module level so the
+# tuple is built once instead of on every call.
+_DEFAULT_WORDS = (
+    'ЭРДОГАН',
+    'ТРИУМФАЛЬНО',
+    'ВОЕННОЕ ВМЕШАТЕЛЬСТВО',
+    'ИНДЕКСИРОВАН БЕЗУКОРИЗНЕННО',
+    'ЗАКАДЫЧНО[ПАРРОМА]',
+    'РОБОТИЧЕСКИ',
+    'ЫТРЭЧ',
+    'АКРОБАТИЧЕСКОЕ',
+    'МЕСТА(!)',
+    'СУХОГРУЗ',
+    'ГАЗ-53',
+    'ОТКЛАДЫВАЕТСЯ ЛИНЕЙНО',
+    'ДЕЖУРНЫЕ С ВЫШЕСТОЯЩИМИ',
+    'Х О Р Т И Ц А',
+    'ЯРОСЛАВСКАЯ ГУБЕРНИЯ',
+    'ПРИСУТСТВИЕ ВОЕННОЕ',
+    'ИМЕНИЯ ВЫШЕСТОЯЩИХ',
+)
+
+
+def test_link_text(text, words=None):
+    """Return True if *text* contains any of the keywords, ignoring case.
+
+    Matching is plain substring containment on the upper-cased text, so
+    characters such as ``[``, ``]``, ``(`` and ``!`` in a keyword are matched
+    literally; keywords are not regular expressions.
+
+    Args:
+        text: Link text to inspect.
+        words: Optional iterable of keywords to use instead of the built-in
+            list. Each keyword is upper-cased before comparison.
+
+    Returns:
+        True when at least one keyword occurs in *text*, otherwise False.
+    """
+    if words is None:
+        words = _DEFAULT_WORDS
+    text = text.upper()
+    # Upper-case the keywords too so caller-supplied lowercase words can match.
+    return any(w.upper() in text for w in words)
+
+def full_url(url):
+    """Return the absolute URL for a site-relative path.
+
+    The path is appended unchanged to the ``https://2ch.hk`` origin.
+    """
+    site_root = 'https://2ch.hk'
+    return site_root + url
+
+def find_triumfalno(board="b", first_page=500, last_page=600, timeout=30):
+    """Scan archive index pages of a 2ch.hk board and print matching threads.
+
+    Fetches ``https://2ch.hk/<board>/arch/<page>.html`` for every page number
+    from *first_page* through *last_page* (inclusive), extracts the links to
+    archived threads from the returned HTML, and prints
+    ``<absolute url> => <link text>`` for each link whose text is accepted by
+    test_link_text().
+
+    Args:
+        board: Board name used in the URL and in the link pattern.
+        first_page: First archive page number to fetch.
+        last_page: Last archive page number to fetch (inclusive).
+        timeout: Seconds to wait for each HTTP request before giving up.
+
+    Raises:
+        requests.RequestException: Propagated from requests.get() on
+            connection failures or timeouts.
+    """
+    # Every fragment is a raw string so ``\d`` and ``\.`` reach the regex
+    # engine intact, and the board name is escaped so it is matched literally.
+    # The pattern does not depend on the page, so it is compiled once.
+    link_re = re.compile(
+        r'<a href="(/' + re.escape(board) + r'/arch/[\d-]+/res/\d+\.html)">(.*?)</a>',
+        flags=re.I,
+    )
+    for page in range(first_page, last_page + 1):
+        url = "https://2ch.hk/%s/arch/%d.html" % (board, page)
+        r = requests.get(url, timeout=timeout)
+        if not r.ok:
+            # Do not parse HTTP error pages as if they were archive indexes.
+            continue
+        for a in link_re.finditer(r.text):
+            link_href = a.group(1)
+            link_text = a.group(2)
+            if test_link_text(link_text):
+                print("%s => %s" % (full_url(link_href), link_text))
+
+# Run the archive scan only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == '__main__':
+ find_triumfalno()