summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrusinthread <rusinthread@cock.li>2017-02-14 16:47:28 +0300
committerrusinthread <rusinthread@cock.li>2017-02-14 16:47:28 +0300
commit2bf675631511bec3cd7e41dd347523ef59715e0c (patch)
tree3fe2562f0c0fe4e04da73a520ac8cf0f1af3209a
parentad2ed5d7dce466ad7f52ccfa4792c678820b3d89 (diff)
add new lib
-rw-r--r--README.md4
-rw-r--r--data_lib.py8
-rw-r--r--util.py334
3 files changed, 344 insertions, 2 deletions
diff --git a/README.md b/README.md
index 10f11ff..d4e2ba0 100644
--- a/README.md
+++ b/README.md
@@ -49,9 +49,11 @@
Скрипт `analyze_new.py` - черновик частотного анализа положения букв для расшифровки шифртекстов 3-го типа, тексты которых лежат в папке `new` (см. [data.md](./data.md) о типах).
+В `util.py` полезные функции для анализа с красочным выводом.
+
## Требования
-Работает на Python 3. Рекомендуется установить модуль `termcolor` чтобы вывод был цветным.
+Работает на Python 3. Рекомендуется установить модуль `termcolor`, чтобы вывод был цветным (а `util.py` вообще не умеет без него работать).
Для `search_2ch_archive.py` нужен модуль `requests`.
diff --git a/data_lib.py b/data_lib.py
index 5bf468e..fea488f 100644
--- a/data_lib.py
+++ b/data_lib.py
@@ -13,7 +13,7 @@ def _data_sort_date(i):
return int(time.mktime(datetime.datetime.strptime(i['date'], '%d/%m/%y').timetuple()))
# sort: 'len', 'date'
-def load_data(sort='len', sort_reverse=False):
+def load_data(sort='len', sort_reverse=False, date=None, type=None):
with open(os.path.join(CWD, "data.json")) as f:
data = json.loads(f.read())
@@ -28,8 +28,14 @@ def load_data(sort='len', sort_reverse=False):
else:
raise Error("Unknown sort type " + str(sort))
+ # filter by date
data = sorted(data, key=sort_f, reverse=sort_reverse)
+ if date:
+ data = list(filter(lambda i: 'date' in i and i['date'] == date, data))
+
+ if type:
+ data = list(filter(lambda i: 'type' in i and i['type'] == type, data))
return data
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..8bb449a
--- /dev/null
+++ b/util.py
@@ -0,0 +1,334 @@
+import re, sys
+from pprint import pprint
+import operator
+from termcolor import cprint
+import itertools
+
# Lowercase Russian alphabet in dictionary order (33 letters, 'ё' included).
RU_ALPHABET = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Vowel letters ("glasnye").
GLAS_LETTERS = 'аеёиоуыэюя'
# Every other letter: the consonants ("soglasnye") plus the signs 'ъ' and 'ь'.
SOGLAS_LETTERS = 'бвгджзйклмнпрстфхцчшщъь'

# Reference frequency of each Russian letter as a fraction of 1, listed from
# the most to the least frequent letter. Insertion order matters: lookups
# that scan this table keep the first entry among equally close candidates.
RU_FREQ = {
    'о': 0.10983, 'е': 0.08483, 'а': 0.07998, 'и': 0.07367,
    'н': 0.067, 'т': 0.06318, 'с': 0.05473, 'р': 0.04746,
    'в': 0.04533, 'л': 0.04343, 'к': 0.03486, 'м': 0.03203,
    'д': 0.02977, 'п': 0.02804, 'у': 0.02615, 'я': 0.02001,
    'ы': 0.01898, 'ь': 0.01735, 'г': 0.01687, 'з': 0.01641,
    'б': 0.01592, 'ч': 0.0145, 'й': 0.01208, 'х': 0.00966,
    'ж': 0.0094, 'ш': 0.00718, 'ю': 0.00639, 'ц': 0.00486,
    'щ': 0.00361, 'э': 0.00331, 'ф': 0.00267, 'ъ': 0.00037,
    'ё': 0.00013,
}
+
# Approximate frequency of each letter in English text, keyed by the
# uppercase letter. The literal is written in percent for readability and is
# converted to fractions of 1 right below, the same scale RU_FREQ uses.
ENG_FREQ = {
    'A': 8.1,
    'B': 1.4,
    'C': 2.7,
    'D': 3.9,
    'E': 13.0,
    'F': 2.9,
    'G': 2.0,
    'H': 5.2,
    'I': 6.5,
    'J': 0.2,
    'K': 0.4,
    'L': 3.4,
    'M': 2.5,
    'N': 7.2,
    'O': 7.9,
    'P': 2.0,
    # 'Q' was absent from the table; roughly 0.1% is the commonly cited value.
    'Q': 0.1,
    'R': 6.9,
    'S': 6.1,
    'T': 10.5,
    'U': 2.4,
    'V': 0.9,
    'W': 1.5,
    'X': 0.2,
    'Y': 1.9,
    'Z': 0.1,
}
# Build a new dict instead of rewriting values while iterating the old one;
# this also avoids leaking loop variables into the module namespace.
ENG_FREQ = {letter: percent / 100 for letter, percent in ENG_FREQ.items()}
+
+
# Ready-made list of candidate words/surnames (lowercase).
# NOTE(review): presumably meant to be passed as `words` to bf_find_words —
# confirm against the callers.
BF_NAMES = (
    'марширующе', 'марширующий',
    'свободин', 'мовсаев',
    'щиголев', 'щиголёв',
)
+
def plural(n, words):
    """Choose between two space-separated word forms based on a count.

    `words` holds the forms as one string, e.g. ``'word words'``. The first
    form is returned when `n` equals 1, the second one for any other `n`.
    """
    forms = words.split(' ')
    if n == 1:
        return forms[0]
    return forms[1]
+
def split_sen(s):
    """Split text into sentence-like chunks.

    The work is done in two passes:

    1. The stripped text is cut after every ``'. '``, ``'? '`` or ``'! '``.
       The punctuation mark stays with its chunk; the space after it is
       dropped.
    2. Inside each chunk, any whitespace-separated token that consists only
       of digits once ``.``, ``!`` and ``?`` are removed (e.g. ``'12.'``) is
       emitted as an item of its own, splitting the surrounding words.

    Returns the resulting list of strings; an empty or blank input gives
    an empty list.
    """
    s = s.strip()
    endings = ('. ', '? ', '! ')

    chunks = []
    pos = 0
    while pos < len(s):
        hits = [i for i in (s.find(end, pos) for end in endings) if i != -1]
        if hits:
            # Compare against "no hit" explicitly: a hit at index 0 is a
            # valid cut point and must not be mistaken for "nothing found".
            cut = min(hits)
            chunks.append(s[pos:cut + 1])
            pos = cut + 2
        else:
            chunks.append(s[pos:])
            pos = len(s)

    result_lines = []
    for chunk in chunks:
        buf = []
        for w in re.split(r'\s+', chunk):
            if re.sub(r'[.!?]', '', w).isdigit():
                # A number ends the words collected so far and stands alone.
                if buf:
                    result_lines.append(' '.join(buf))
                    buf = []
                result_lines.append(w)
            else:
                buf.append(w)
        if buf:
            result_lines.append(' '.join(buf))

    return result_lines
+
def analyze_sentences(lines):
    """Print a coloured one-row summary for every string in `lines`.

    Each row shows the 1-based row number, the text padded to a common
    width, the word count, the character count with and without spaces, and
    the value of unique_letters_amount() for that row. A final row reports
    unique_letters_amount() for all rows joined together.
    """
    # Common column width: the longest row plus one separating space.
    width = max((len(text) for text in lines), default=0) + 1

    for number, text in enumerate(lines, start=1):
        word_count = len(re.split(r'\s+', text))

        cprint('%2d. ' % number, 'cyan', end='')
        print(text.ljust(width), end='')

        cprint(str(word_count), 'green', attrs=['bold'], end='')
        cprint(' %s,' % plural(word_count, 'word words'), 'green', end='')

        cprint(' %d' % len(text), 'yellow', attrs=['bold'], end='')
        cprint('/', 'yellow', end='')
        cprint('%d' % len(text.replace(' ', '')), 'yellow', attrs=['bold'], end='')
        cprint(' chars ', 'yellow', end='')

        cprint('(', 'red', end='')
        cprint(unique_letters_amount(text), 'red', attrs=['bold'], end='')
        cprint(' unique)', 'red')

    total_unique = unique_letters_amount(''.join(lines))
    cprint('Total unique characters: %d\n' % total_unique, 'white', attrs=['bold'], end='')
+
+
def analyze_letters_frequency(s):
    """Guess a substitution table by matching letter frequencies to RU_FREQ.

    The text is lowercased, the characters space, ``.``, ``?``, ``,``, ``!``
    and ``'`` are removed, and 'ё'/'ъ' are folded into 'е'/'ь'. Every
    remaining distinct symbol, taken from the most to the least frequent, is
    paired with the not-yet-used RU_FREQ letter whose reference frequency is
    closest to the symbol's observed frequency. Each pairing is printed.

    Returns a dict mapping each symbol of the cleaned text to its guessed
    letter, or to ``'?'`` once every RU_FREQ letter has been used. An input
    with nothing left after cleaning gives an empty dict.
    """
    from collections import Counter

    s = s.lower().translate(str.maketrans('ёъ', 'еь', " .?,!'"))
    total = len(s)

    # One counting pass instead of str.count() per distinct symbol.
    # most_common() sorts stably by count, so equally frequent symbols keep
    # their order of first appearance in the text.
    ranked = [(symbol, count / total) for symbol, count in Counter(s).most_common()]

    repl_table = {}
    used = set()

    for symbol, freq in ranked:
        closest_delta = 1
        closest_letter = '?'
        closest_tbl_freq = 0

        for tbl_l, tbl_freq in RU_FREQ.items():
            delta = abs(tbl_freq - freq)
            if delta < closest_delta and tbl_l not in used:
                closest_delta = delta
                closest_letter = tbl_l
                closest_tbl_freq = tbl_freq

        repl_table[symbol] = closest_letter
        used.add(closest_letter)
        print('%s (%f) ---> %s (%f)' % (symbol, freq, closest_letter, closest_tbl_freq))

    return repl_table
+
def print_hl(s, tohl, end="\n"):
    """Print `s` character by character, highlighting every `tohl`.

    Characters equal to `tohl` (exact, case-sensitive comparison) are printed
    in bold underlined red; all others are printed as-is. `end` is written
    after the last character.

    Returns the list of 0-based positions at which `tohl` occurred.
    """
    positions = []
    for pos, ch in enumerate(s):
        if ch == tohl:
            positions.append(pos)
            cprint(ch, 'red', attrs=['bold', 'underline'], end='')
        else:
            print(ch, end='')
    print('', end=end)
    return positions
+
def rot_en(s):
    """Print all 26 Caesar shifts of `s` over the Latin alphabet.

    The text is uppercased first. One line is printed per shift, starting
    with shift 0 (the uppercased text itself). Letters A-Z are rotated;
    every other character (digits, punctuation, spaces, non-Latin letters)
    is printed unchanged instead of raising ValueError.
    """
    alphabet = ''.join(chr(code) for code in range(ord('A'), ord('Z') + 1))
    size = len(alphabet)

    s = s.upper()
    for shift in range(size):
        print(''.join(
            alphabet[(alphabet.index(ch) + shift) % size] if ch in alphabet else ch
            for ch in s
        ))
+
def rot_ru(s):
    """Print every Caesar shift of `s` over the Russian alphabet.

    The text is uppercased first. One line is printed per shift (as many
    lines as RU_ALPHABET has letters), starting with shift 0. Letters found
    in RU_ALPHABET are rotated; every other character (digits, punctuation,
    spaces, Latin letters) is printed unchanged instead of raising
    ValueError.
    """
    alphabet = RU_ALPHABET.upper()
    size = len(alphabet)

    s = s.upper()
    for shift in range(size):
        print(''.join(
            alphabet[(alphabet.index(ch) + shift) % size] if ch in alphabet else ch
            for ch in s
        ))
+
def bf_all(table):
    """Return every combination that takes one item from each entry of `table`.

    `table` is an iterable of iterables; the result is the list of tuples
    produced by their Cartesian product, in itertools.product order.
    """
    combinations = itertools.product(*table)
    return list(combinations)
+
+def spaceitout(string,amount):
+ amountint = int(amount)
+ pile = ""
+ for char in string:
+ pile = pile + char + " "*amount
+ return pile.strip()
+
def unique_letters_amount(line):
    """Count how many distinct letters of RU_ALPHABET occur in `line`.

    The comparison is case-insensitive: both the alphabet and the text are
    uppercased. Characters outside RU_ALPHABET are not counted.
    """
    upper_line = line.upper()
    return sum(1 for letter in RU_ALPHABET.upper() if letter in upper_line)
+
def bf_find_words(lines, words, nospaces=False):
    """Search `lines` for words spelled "vertically", one letter per line.

    A word of length k is reported at line n when, for every position i,
    the i-th letter of the word occurs somewhere in line n+i. Matching is
    case-insensitive. For every hit the word, the 1-based range of lines and
    each involved line are printed; the matching letter is highlighted via
    print_hl() and its 1-based positions are listed after the line.

    lines    -- sequence of strings to search.
    words    -- iterable of candidate words; empty strings are ignored.
    nospaces -- when true, all whitespace is removed from the lines first,
                so the printed positions refer to the compacted text.

    Returns None; all results go to stdout.
    """
    # An empty word would "match" at every line and print a zero-length range.
    words = [w.lower() for w in words if w]
    if not words:
        return

    if nospaces:
        lines = [re.sub(r'\s+', '', text) for text in lines]

    # No word can start later than this many lines before the end.
    min_word_len = min(len(w) for w in words)

    for line_start in range(len(lines) - min_word_len + 1):
        cur_lines = lines[line_start:]
        for w in words:
            if len(w) > len(cur_lines):
                continue

            window = cur_lines[:len(w)]
            if not all(letter in text.lower() for letter, text in zip(w, window)):
                continue

            # Width used to align the trailing columns of this match.
            max_sen_len = max(len(text) for text in window)

            # word <word> found in <n>-<m>
            print('word ', end='')
            cprint(w, 'white', attrs=['bold', 'underline'], end='')
            print(' found in ', end='')
            cprint(line_start + 1, 'white', attrs=['bold'], end='')
            print('-', end='')
            cprint(line_start + len(w), 'white', attrs=['bold'], end='')
            print(':')

            for offset, (letter, text) in enumerate(zip(w, window)):
                print(' ', end='')
                cprint('%2d. ' % (line_start + offset + 1), 'cyan', end='')
                # NOTE: print_hl compares case-sensitively while the match
                # above ignores case, so an occurrence that differs in case
                # from `letter` is neither highlighted nor listed.
                indexes = print_hl(text, letter, end='')
                if len(text) < max_sen_len:
                    print(' ' * (max_sen_len - len(text)), end='')

                cprint(' %s. ' % letter, 'cyan', end='')

                positions = [str(n + 1) for n in indexes]
                for k, position in enumerate(positions):
                    cprint(position, 'green', end='')
                    if k < len(positions) - 1:
                        print(', ', end='')

                print()

            print()