forked from oster/rtce-experiments-toolbox
-
Notifications
You must be signed in to change notification settings - Fork 1
/
get-Word-Count-revisions-every-mn.py
executable file
·158 lines (121 loc) · 4.7 KB
/
get-Word-Count-revisions-every-mn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#! /usr/bin/python -tt
import os
import re
import json
import requests
import gzip
from datetime import datetime, timedelta
import time
from collections import Counter
# Module-level state used by reset_user_ids() and generate_user_id():
#   last_uid -- the most recently issued numeric id (-1 while none was issued)
#   user_ids -- maps each raw identifier string to the numeric id issued for it
last_uid, user_ids = -1, {}
def reset_user_ids():
    """Forget every issued id so that numbering starts again from 0.

    Rebinds the module-level ``last_uid`` to -1 and ``user_ids`` to a new
    empty dict.
    """
    global last_uid, user_ids
    last_uid, user_ids = -1, {}
def generate_user_id(str_uid):
    """Return the numeric id associated with ``str_uid``.

    The first time an identifier is seen it receives the next integer
    (``last_uid + 1``) and the pair is remembered in ``user_ids``; later calls
    with the same identifier return the remembered integer.
    """
    global last_uid
    global user_ids
    if str_uid not in user_ids:
        # unseen identifier: issue the next number and remember it
        last_uid += 1
        user_ids[str_uid] = last_uid
    return user_ids[str_uid]
def load_pad_revisions(db_file, padid):
    """Load the revisions of one pad from a gzip-compressed database dump.

    The file is read line by line, one JSON record per line.  A revision is
    built from two consecutive records: a metadata record whose line starts
    with ``{"key":"pad:<padid>:revs:<n>"`` and, right after it, the pad record
    (key ``pad:<padid>``) holding the text and the ``head`` revision number.

    Args:
        db_file: path of the gzip-compressed database file.
        padid: identifier of the pad; it is matched literally.

    Returns:
        A list of dicts in file order, each with the keys ``rev`` (int),
        ``timestamp`` (as stored), ``datetime`` (``datetime.fromtimestamp`` of
        the timestamp integer-divided by 1000), ``author`` and ``content``.

    Raises:
        Exception: if the record following a revision's metadata is not the
            pad record, or if its ``head`` differs from the revision number.
    """
    padid_bytes = padid if isinstance(padid, bytes) else padid.encode('utf-8')
    # The pad id is escaped so that characters such as '.' cannot match
    # another pad's records.
    rev_key_pattern = re.compile(
        b'{"key":"pad:' + re.escape(padid_bytes) + b':revs:([0-9]+)"')
    pad_key = u'pad:' + padid_bytes.decode('utf-8')

    revisions = []
    with gzip.open(db_file, 'rb') as f:
        lines = iter(f)
        for raw_meta in lines:
            match = rev_key_pattern.match(raw_meta)
            if not match:
                continue
            rev = int(match.group(1))
            rev_partA = json.loads(raw_meta.decode('utf-8'))

            raw_pad = next(lines, None)
            if raw_pad is None:
                # The metadata record was the last line of the file: stop
                # without reporting an error, as the loader always has.
                break
            rev_partB = json.loads(raw_pad.decode('utf-8'))

            # Require the pad record itself; a mere prefix test would also
            # accept 'pad:<padid>:revs:N' or a longer pad id.
            if rev_partB['key'] != pad_key:
                raise Exception('missing pad content for rev:%i' % (rev))
            if rev_partB['val']['head'] != rev:
                raise Exception('head (%i) does not match with current rev (%i)' % (rev_partB['val']['head'], rev))

            meta = rev_partA['val']['meta']
            timestamp = meta['timestamp']
            revisions.append({
                'rev': rev,
                'timestamp': timestamp,
                # floor division keeps whole seconds on Python 2 and 3 alike
                'datetime': datetime.fromtimestamp(int(timestamp) // 1000),
                'author': meta['author'],
                'content': rev_partB['val']['atext']['text'],
            })
    return revisions
def format_time(timestamp):
    """Render ``timestamp`` as a zero-padded ``HH:MM:SS`` string.

    ``timestamp`` must provide ``strftime`` (for instance a ``datetime``).
    """
    clock_format = '%H:%M:%S'
    return timestamp.strftime(clock_format)
def get_revision(revisions, revision_num):
    """Return the first revision whose ``'rev'`` equals ``revision_num``.

    Args:
        revisions: iterable of revision dicts, each with a ``'rev'`` key.
        revision_num: revision number to look for.

    Returns:
        The first matching revision dict, in iteration order.

    Raises:
        IndexError: if no revision carries that number.  The exception type is
            the one the previous list-indexing implementation produced; the
            message now names the missing revision.
    """
    for candidate in revisions:
        if candidate['rev'] == revision_num:
            # stop at the first hit instead of scanning the whole history
            return candidate
    raise IndexError('no revision with number %r' % (revision_num,))
def get_revision_at_time(revisions, certain_time):
    """Return the last revision, in list order, dated at or before a given time.

    Args:
        revisions: iterable of revision dicts, each with a ``'datetime'`` key.
        certain_time: instant compared against each revision's ``'datetime'``.

    Returns:
        The last revision in iteration order whose ``'datetime'`` is
        ``<= certain_time``.  No sorting is done here, so this is the most
        recent such revision only if ``revisions`` is in chronological order.

    Raises:
        IndexError: if every revision is later than ``certain_time`` (or there
            are no revisions).  The exception type is the one the previous
            list-indexing implementation produced; the message now carries
            the requested instant.
    """
    latest = None
    for candidate in revisions:
        if candidate['datetime'] <= certain_time:
            latest = candidate
    if latest is None:
        raise IndexError('no revision at or before %s' % (certain_time,))
    return latest
def get_revision_Word_Count(rev):
    """Count the whitespace-separated words in a revision's text.

    ``rev['content']`` is encoded to UTF-8 first and the resulting byte
    string is split, so the split is done on bytes rather than on the
    original text.
    """
    encoded_text = rev['content'].encode('utf8')
    words = encoded_text.split()
    return len(words)
# def get_revision_Kandel_Moles(rev):
# payload = { 'sourcetext': rev['content'].encode('utf8') , 'calcmethod': 'kandelmoles', 'calcbutton': 'Calculate+score' }
# r = requests.post("http://www.standards-schmandards.com/exhibits/rix/index.php", data=payload)
# pattern = re.compile('Kandel & Moles score: <strong>(-?[0-9]+)</strong>.')
# res = pattern.search(r.text)
# return res.group(1)
def write_word_count_all_revisions(file, num, experiment_name):
    """Write one CSV row of per-minute word counts for a group's pad.

    The pad ``experiment_name + num`` is loaded from
    ``INPUT_DATA_PATH + num + '/dirty.db.gz'``.  For every minute from 1 to 15
    after the "first change" revision, the revision selected by
    ``get_revision_at_time`` for that instant is taken and its word count
    recorded.  A progress line is printed for the group and for each minute.

    The row written to ``file`` is the group number (converted through
    ``int``, so leading zeros are dropped) followed by the 15 counts, all
    separated by ``", "`` and terminated by a newline.  The row is written in
    a single call, so a failure while selecting revisions leaves no partial
    row behind.

    Reads the module-level names ``INPUT_DATA_PATH`` and
    ``first_changes_rev_num``; the calling script must set them beforehand.

    Args:
        file: object with a ``write`` method that receives the row.
        num: group number as a string of digits (e.g. ``'004'``).
        experiment_name: experiment name; prefixed to ``num`` to form the
            pad identifier.
    """
    revisions = load_pad_revisions(INPUT_DATA_PATH + num + '/dirty.db.gz', experiment_name + num)
    first_changes_rev = get_revision(revisions, first_changes_rev_num)
    start_time = first_changes_rev['datetime']

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print("= Group: %s" % (num))

    row = [str(int(num))]
    for minute in range(1, 16):
        rev = get_revision_at_time(revisions, start_time + timedelta(minutes=minute))
        print("== %i min. after - rev: %s (%s)" % (minute, rev['rev'], format_time(rev['datetime'])))
        row.append(str(get_revision_Word_Count(rev)))

    file.write(", ".join(row) + "\n")
# Directory prefix to which the group number and '/dirty.db.gz' are appended.
INPUT_DATA_PATH = './DATA-by-num/'
# JSON file giving, per group and experiment, the revision numbers of interest.
INPUT_DATA_JSON_FILE = './chat-slicing-data-notes.json'
# CSV file that receives one row of word counts per processed pad.
OUTPUT_DATA_FILE = './DATA-results/WordCount.csv'
# data_json = '''{
# "004": {
# "notes": {
# "init-rev": 3 ,
# "first-change-rev": 8 ,
# "end-of-audio-rev": 488
# }
# }
# }'''
# data = json.loads(data_json)
# Load the notes: data[group][experiment] holds the "init-rev",
# "first-change-rev" and "end-of-audio-rev" revision numbers.
with open(INPUT_DATA_JSON_FILE, "r") as notes_file:
    data = json.load(notes_file)

with open(OUTPUT_DATA_FILE, "w") as w:
    # Header row: "Group" followed by one column per sampled minute (1..15).
    header_cells = ["Group"]
    for minute in range(1, 16):
        header_cells.append("Score-" + str(minute) + "_mn")
    w.write(", ".join(header_cells))
    w.write("\n")

    for group in sorted(data):
        for experiment, notes in data[group].items():
            # Group '014' reads its "corrections" and "films" pads under
            # number '015'.
            # NOTE(review): presumably those pads were recorded under the
            # other number -- confirm against the data set.
            num = '015' if group == '014' and experiment in ("corrections", "films") else group

            reset_user_ids()

            # These module-level names are read by
            # write_word_count_all_revisions().
            initial_doc_rev_num = notes["init-rev"]
            first_changes_rev_num = notes["first-change-rev"]
            end_of_audio_rev_num = notes["end-of-audio-rev"]

            write_word_count_all_revisions(w, num, experiment)