diff options
author | rusinthread <rusinthread@cock.li> | 2016-12-23 04:18:40 +0300 |
---|---|---|
committer | rusinthread <rusinthread@cock.li> | 2016-12-23 04:18:40 +0300 |
commit | 5298146ed6a1123a342aec3633debeba0ad372b7 (patch) | |
tree | 0eb2facb7a89084271c82551352468b5eb2865f1 | |
parent | f41257ca1f29172755ac0a4c10855685b54163bf (diff) |
parse flibusta url
-rwxr-xr-x | main.py | 29 |
1 files changed, 26 insertions, 3 deletions
```diff
@@ -26,8 +26,8 @@ def load_data():
 
 def clean_string(s, remove_junk=False):
     s = s.replace(')', ') ')
-    s = re.sub(r'\!([^\)])', r'! \1', s)
-    s = s.replace('/', ' ')
+    s = re.sub(r'(\!|\.)([^\)])', r'\1 \2', s)
+    #s = s.replace('/', ' ')
     s = s.upper()
     s = re.sub(r'\s+', ' ', s).strip()
 
@@ -55,6 +55,7 @@ def clean_string(s, remove_junk=False):
         'ГУБЕРНИЯ',
         'С ВЫШЕСТОЯЩИМИ КОНТРОЛЬ',
         'С ЛОКАЦИИ',
+        'SEARCHED',
         #'КАЗНЬ',
         'ГУБЕРНИЯ',
         'ПРОВЕРКИ',
@@ -80,7 +81,26 @@ def clean_string(s, remove_junk=False):
         'УКАЗАНО',
         '- ВЫСОКИЙ ТИТУЛ',
         'ЗАКАЗ',
-        'ЧЕРТЫ ЛИЦА'
+        'ЧЕРТЫ ЛИЦА',
+
+        # english
+        'SCHOOL ON THE RIGHT',
+        'WILL NOT ALLOW',
+        'FLYWHEEL',
+        'TRIUMPHANTLY',
+        #'UNACCEPTABLE',
+        'BEING USED',
+        'NICE',
+        'UMBRELLA',
+        #'BIOROBOT',
+        'CONSERVATISM',
+        'WAS ESTABLISHED',
+        'WITH A PASSWORD',
+        'ANT',
+        'YEAR',
+        'RECOGNIZED'
+        #'LEGAL',
+        #'FIGHTING'
     ]
 
     # только без пробелов
@@ -113,6 +133,9 @@ def decode(s):
 
         if re.match(r'^\d+\%$', word):
             buf += word
+        elif word.endswith('://'):
+            buf += word[0]
+            buf += '://'
         else:
             letter = word[0]
             buf += letter
```