-
-
Notifications
You must be signed in to change notification settings - Fork 46k
/
entropy.py
132 lines (111 loc) · 4.74 KB
/
entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""
Implementation of entropy of information
https://en.wikipedia.org/wiki/Entropy_(information_theory)
"""
from __future__ import annotations
import math
from collections import Counter
from string import ascii_lowercase
def calculate_prob(text: str) -> None:
    """
    Print three entropy figures for ``text``, one per line.

    The character and character-pair counts are obtained from
    ``analyze_text``. Only the space character and the lowercase ASCII
    letters contribute terms to the sums, while every probability is a count
    divided by the total of *all* counted entries (other characters included).

    :param text: the text to analyse
    :return: None. Each printed value is rounded to a whole number and shown
        with one decimal place:
        1) Entropy of information based on single characters
        2) Entropy of information based on two-character sequences
        3) The (unrounded) second value minus the first, i.e. H(Xn | Xn-1)

    The sample texts below come from random books and random quotes.

    >>> text = ("Behind Winston's back the voice "
    ...         "from the telescreen was still "
    ...         "babbling and the overfulfilment")
    >>> calculate_prob(text)
    4.0
    6.0
    2.0

    >>> text = ("The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official"
    ...         "face in elegant lettering, the three")
    >>> calculate_prob(text)
    4.0
    5.0
    1.0

    >>> text = ("Had repulsive dashwoods suspicion sincerity but advantage now him. "
    ...         "Remark easily garret nor nay. Civil those mrs enjoy shy fat merry. "
    ...         "You greatest jointure saw horrible. He private he on be imagine "
    ...         "suppose. Fertile beloved evident through no service elderly is. Blind "
    ...         "there if every no so at. Own neglected you preferred way sincerity "
    ...         "delivered his attempted. To of message cottage windows do besides "
    ...         "against uncivil. Delightful unreserved impossible few estimating "
    ...         "men favourable see entreaties. She propriety immediate was improving. "
    ...         "He or entrance humoured likewise moderate. Much nor game son say "
    ...         "feel. Fat make met can must form into gate. Me we offending prevailed "
    ...         "discovery.")
    >>> calculate_prob(text)
    4.0
    7.0
    3.0
    """
    char_counts, pair_counts = analyze_text(text)
    alphabet = " " + ascii_lowercase

    # --- single characters ---
    # The denominator covers every counted character, not just the alphabet.
    char_total = sum(char_counts.values())
    char_log_sum = 0
    for symbol in alphabet:
        if symbol not in char_counts:
            continue
        share = char_counts[symbol] / char_total
        char_log_sum += share * math.log2(share)  # p * log2(p)
    char_entropy = -1 * char_log_sum
    print(f"{round(char_entropy):.1f}")

    # --- two-character sequences ---
    pair_total = sum(pair_counts.values())
    pair_log_sum = 0
    # Walk the pairs in alphabet order so the float accumulation order is fixed.
    for first in alphabet:
        for second in alphabet:
            pair = first + second
            if pair not in pair_counts:
                continue
            share = pair_counts[pair] / pair_total
            pair_log_sum += share * math.log2(share)
    pair_entropy = -1 * pair_log_sum
    print(f"{round(pair_entropy):.1f}")

    # --- difference of the two (rounded only after subtracting) ---
    print(f"{round(pair_entropy - char_entropy):.1f}")
def analyze_text(text: str) -> tuple[dict, dict]:
    """
    Convert text input into two dicts of counts.

    The first dictionary stores the frequency of single character strings.
    The second dictionary stores the frequency of two character strings; the
    text is treated as if it were preceded by a single space, so the first
    character is also counted as the pair ``" " + text[0]``.

    An empty ``text`` yields two empty counters instead of raising
    ``IndexError``.

    :param text: the text to analyse
    :return: ``(single_char_counts, two_char_counts)`` as ``Counter`` objects

    >>> analyze_text("ab") == ({"a": 1, "b": 1}, {" a": 1, "ab": 1})
    True
    >>> analyze_text("") == ({}, {})
    True
    """
    single_char_strings: Counter[str] = Counter(text)
    # Pairing the space-prefixed text with the text itself produces the
    # leading " " + text[0] pair followed by every adjacent character pair,
    # and naturally produces nothing for an empty string.
    two_char_strings: Counter[str] = Counter(
        previous + current for previous, current in zip(" " + text, text)
    )
    return single_char_strings, two_char_strings
def main() -> None:
    """Run the doctests embedded in this module's docstrings."""
    from doctest import testmod

    testmod()
    # For a manual run, pass any string to calculate_prob(), for example one
    # of the sample texts used in its docstring.
# Only run main() when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()