From 5298146ed6a1123a342aec3633debeba0ad372b7 Mon Sep 17 00:00:00 2001 From: rusinthread Date: Fri, 23 Dec 2016 04:18:40 +0300 Subject: parse flibusta url --- main.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 5e50913..48ac788 100755 --- a/main.py +++ b/main.py @@ -26,8 +26,8 @@ def load_data(): def clean_string(s, remove_junk=False): s = s.replace(')', ') ') - s = re.sub(r'\!([^\)])', r'! \1', s) - s = s.replace('/', ' ') + s = re.sub(r'(\!|\.)([^\)])', r'\1 \2', s) + #s = s.replace('/', ' ') s = s.upper() s = re.sub(r'\s+', ' ', s).strip() @@ -55,6 +55,7 @@ def clean_string(s, remove_junk=False): 'ГУБЕРНИЯ', 'С ВЫШЕСТОЯЩИМИ КОНТРОЛЬ', 'С ЛОКАЦИИ', + 'SEARCHED', #'КАЗНЬ', 'ГУБЕРНИЯ', 'ПРОВЕРКИ', @@ -80,7 +81,26 @@ def clean_string(s, remove_junk=False): 'УКАЗАНО', '- ВЫСОКИЙ ТИТУЛ', 'ЗАКАЗ', - 'ЧЕРТЫ ЛИЦА' + 'ЧЕРТЫ ЛИЦА', + + # english + 'SCHOOL ON THE RIGHT', + 'WILL NOT ALLOW', + 'FLYWHEEL', + 'TRIUMPHANTLY', + #'UNACCEPTABLE', + 'BEING USED', + 'NICE', + 'UMBRELLA', + #'BIOROBOT', + 'CONSERVATISM', + 'WAS ESTABLISHED', + 'WITH A PASSWORD', + 'ANT', + 'YEAR', + 'RECOGNIZED' + #'LEGAL', + #'FIGHTING' ] # только без пробелов @@ -113,6 +133,9 @@ def decode(s): if re.match(r'^\d+\%$', word): buf += word + elif word.endswith('://'): + buf += word[0] + buf += '://' else: letter = word[0] buf += letter -- cgit v1.2.3