Skip to content

Commit e454f35

Browse files
committed
working towards py3 compatibility
- fixed imports - print statements
1 parent 173fe9e commit e454f35

6 files changed

Lines changed: 172 additions & 169 deletions

File tree

wp_parser/ChatFeatures.py

Lines changed: 82 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,27 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import division
3-
import datelib
4-
import re
3+
54
import operator
65

7-
class ChatFeatures():
6+
import re
7+
from . import datelib
8+
89

10+
class ChatFeatures:
911
def __init__(self):
10-
self.root_response_time = []
12+
self.root_response_time = []
1113
self.contact_response_time = []
12-
self.root_burst = []
13-
self.contact_burst = []
14-
self.initiations = {}
15-
self.weekday = {}
16-
self.shifts = {}
17-
self.patterns = {}
18-
self.proportions = {}
19-
self.most_used_words = {}
20-
21-
def compute_response_time_and_burst(self, list_of_messages, root_name, senders, initiation_thrs=(60*60*8), burst_thrs=3, response_thrs=(60*60*3)):
14+
self.root_burst = []
15+
self.contact_burst = []
16+
self.initiations = {}
17+
self.weekday = {}
18+
self.shifts = {}
19+
self.patterns = {}
20+
self.proportions = {}
21+
self.most_used_words = {}
22+
23+
def compute_response_time_and_burst(self, list_of_messages, root_name, senders, initiation_thrs=(60 * 60 * 8),
24+
burst_thrs=3, response_thrs=(60 * 60 * 3)):
2225
# perform the operations that are dependent on multiple messages
2326
# (response time, bursts)
2427
self.initiations = {}
@@ -27,30 +30,30 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders,
2730
t0 = list_of_messages[0].datetime_obj
2831
burst_count = 1
2932
for index, message in enumerate(list_of_messages):
30-
#skip the first message since we are looking at differences; note this means we don't count first msg as init
33+
# skip the first message since we are looking at differences; note this means we don't count first msg as init
3134
if index == 0:
3235
continue
3336
t1 = message.datetime_obj
3437
dt = t1 - t0
3538
dt.total_seconds()
3639

3740
# print "sender %s delta %s" % ( message.sender, dt.total_seconds() )
38-
if (dt.total_seconds() > initiation_thrs):
41+
if dt.total_seconds() > initiation_thrs:
3942
self.initiations[message.sender] += 1
4043

4144
# is sender the same as the last message?
42-
if message.sender != list_of_messages[index-1].sender:
45+
if message.sender != list_of_messages[index - 1].sender:
4346
# sender changed, store the burst count and reset
44-
#print "sender changed: %s" % ( message.sender )
45-
#print "burst count: %s" % ( burst_count )
47+
# print "sender changed: %s" % ( message.sender )
48+
# print "burst count: %s" % ( burst_count )
4649

47-
#print("response time: %d\n" %(dt.total_seconds()) )
50+
# print("response time: %d\n" %(dt.total_seconds()) )
4851
# is sender the root?
4952
if message.sender == root_name:
5053
# store the burst count for the last sender, which is the
5154
# opposite of current
5255
if burst_count > burst_thrs:
53-
#print "BURST CONTACT ENDED: %s IN A ROW" % ( burst_count )
56+
# print "BURST CONTACT ENDED: %s IN A ROW" % ( burst_count )
5457
self.contact_burst.append(burst_count)
5558
if dt.total_seconds() < response_thrs:
5659
self.root_response_time.append(dt.total_seconds())
@@ -59,24 +62,24 @@ def compute_response_time_and_burst(self, list_of_messages, root_name, senders,
5962
# store the burst count for the last sender, which is the
6063
# opposite of current
6164
if burst_count > burst_thrs:
62-
#print "BURST ROOT ENDED: %s IN A ROW" % ( burst_count )
65+
# print "BURST ROOT ENDED: %s IN A ROW" % ( burst_count )
6366
self.root_burst.append(burst_count)
6467
if dt.total_seconds() < response_thrs:
6568
self.contact_response_time.append(dt.total_seconds())
66-
69+
6770
# End of the first burst, restart the counter
6871
burst_count = 1
6972

7073
else:
7174
# accumulate the number of messages sent in a row
7275
burst_count += 1
7376
t0 = t1
74-
if burst_count > burst_thrs: #catch a burst if at end of chat
75-
#print "final burst: %s" % ( burst_count )
76-
if message.sender == root_name:
77+
if burst_count > burst_thrs: # catch a burst if at end of chat
78+
# print "final burst: %s" % ( burst_count )
79+
if message.sender == root_name:
7780
self.root_burst.append(burst_count)
7881
else:
79-
self.contact_burst.append(burst_count)
82+
self.contact_burst.append(burst_count)
8083

8184
def compute_messages_per_weekday(self, list_of_messages):
8285
self.weekday = {
@@ -105,16 +108,16 @@ def compute_messages_per_shift(self, list_of_messages):
105108
}
106109
for msg in list_of_messages:
107110
hour = int(msg.time.split(":")[0])
108-
if hour >= 0 and hour <= 6:
111+
if 0 <= hour <= 6:
109112
self.shifts["latenight"] += 1
110113

111-
elif hour > 6 and hour <= 11:
114+
elif 6 < hour <= 11:
112115
self.shifts["morning"] += 1
113116

114-
elif hour > 11 and hour <= 17:
117+
elif 11 < hour <= 17:
115118
self.shifts["afternoon"] += 1
116119

117-
elif hour > 17 and hour <= 23:
120+
elif 17 < hour <= 23:
118121
self.shifts["evening"] += 1
119122
return self.shifts
120123

@@ -134,7 +137,7 @@ def compute_messages_pattern(self, list_of_messages, senders, pattern_list):
134137
if length > 0:
135138
if pattern not in self.patterns:
136139
self.patterns[pattern][msg.sender] = length
137-
print "This should never happen"
140+
print("This should never happen")
138141
else:
139142
self.patterns[pattern][msg.sender] += length
140143
return self.patterns
@@ -149,10 +152,10 @@ def compute_message_proportions(self, list_of_messages, senders, root, contact):
149152
self.proportions[i][s] = 0
150153
for msg in list_of_messages:
151154
self.proportions["messages"][msg.sender] += 1
152-
self.proportions["words"][msg.sender] += len(msg.content.split(" "))
153-
self.proportions["chars"][msg.sender] += len(msg.content.strip())
154-
self.proportions["qmarks"][msg.sender] += msg.content.count('?')
155-
self.proportions["exclams"][msg.sender] += msg.content.count('!')
155+
self.proportions["words"][msg.sender] += len(msg.content.split(" "))
156+
self.proportions["chars"][msg.sender] += len(msg.content.strip())
157+
self.proportions["qmarks"][msg.sender] += msg.content.count('?')
158+
self.proportions["exclams"][msg.sender] += msg.content.count('!')
156159
self.proportions["media"][msg.sender] += (
157160
msg.content.count('<media omitted>') +
158161
msg.content.count('<image omitted>') +
@@ -170,24 +173,24 @@ def compute_message_proportions(self, list_of_messages, senders, root, contact):
170173
self.proportions["avg_words"] = {}
171174
for s in senders:
172175
self.proportions["avg_words"][s] = self.proportions["words"][s] / self.proportions["messages"][s]
173-
self.proportions["avg_words"]["ratio"] = self.proportions["avg_words"][root] / self.proportions["avg_words"][contact]
176+
self.proportions["avg_words"]["ratio"] = self.proportions["avg_words"][root] / self.proportions["avg_words"][
177+
contact]
174178

175179
for c in categories:
176180
self.proportions[c]["total"] = 0
177181
for s in senders:
178182
self.proportions[c]["total"] += self.proportions[c][s]
179-
183+
180184
for c in categories:
181-
182-
#if a value is 0, replace with a 1 to avoid zero erros in ratio calcs.
185+
186+
# if a value is 0, replace with a 1 to avoid zero errors in ratio calcs.
183187
if self.proportions[c][contact] == 0:
184188
self.proportions[c][contact] = 1
185189
if self.proportions[c][root] == 0:
186-
self.proportions[c][root] = 1
190+
self.proportions[c][root] = 1
187191

188192
self.proportions[c]["ratio"] = self.proportions[c][root] / self.proportions[c][contact]
189193

190-
191194
return self.proportions
192195

193196
def compute_most_used_words(self, list_of_messages, top=10, threshold=3):
@@ -204,37 +207,37 @@ def compute_most_used_words(self, list_of_messages, top=10, threshold=3):
204207
words_counter[w] = 1
205208
else:
206209
words_counter[w] += 1
207-
sorted_words = sorted(words_counter.iteritems(), key=operator.itemgetter(1), reverse=True)
210+
sorted_words = sorted(words_counter.items(), key=operator.itemgetter(1), reverse=True)
208211
self.most_used_words = sorted_words[:top]
209212
return self.most_used_words
210213

211214
def compute_avg_root_response_time(self):
212-
if (len(self.root_response_time) != 0):
213-
return sum(self.root_response_time)/len(self.root_response_time)
215+
if len(self.root_response_time) != 0:
216+
return sum(self.root_response_time) / len(self.root_response_time)
214217
return 0
215218

216219
def compute_avg_contact_response_time(self):
217-
if (len(self.contact_response_time) != 0):
218-
return sum(self.contact_response_time)/len(self.contact_response_time)
220+
if len(self.contact_response_time) != 0:
221+
return sum(self.contact_response_time) / len(self.contact_response_time)
219222
return 0
220223

221224
def compute_response_time_ratio(self, root, contact):
222225
avg_root = self.compute_avg_root_response_time()
223226
avg_contact = self.compute_avg_contact_response_time()
224-
if (avg_contact != 0):
227+
if avg_contact != 0:
225228
return avg_root / avg_contact
226229
return 0
227230

228231
def compute_bursts_ratio(self, root, contact):
229232
if (len(self.contact_burst)) == 0:
230233
return len(self.root_burst) / 1
231-
if (len(self.root_burst) == 0):
232-
return ( 1/len(self.contact_burst))
233-
return len(self.root_burst)/len(self.contact_burst)
234+
if len(self.root_burst) == 0:
235+
return 1 / len(self.contact_burst)
236+
return len(self.root_burst) / len(self.contact_burst)
234237

235238
def compute_nbr_root_burst(self):
236239
return len(self.root_burst)
237-
240+
238241
def compute_nbr_contact_burst(self):
239242
return len(self.contact_burst)
240243

@@ -244,48 +247,41 @@ def compute_nbr_contact_burst(self):
244247
# return 0
245248

246249
def compute_avg_contact_burst(self):
247-
if (len(self.contact_burst) != 0):
248-
return sum(self.contact_burst)/len(self.contact_burst)
250+
if len(self.contact_burst) != 0:
251+
return sum(self.contact_burst) / len(self.contact_burst)
249252
return 0
250253

251254
def compute_root_initation_ratio(self, root, contact):
252-
if (self.initiations[contact] == 0):
253-
return self.initiations[root]/1
254-
if (self.initiations[root] == 0):
255-
return 1/self.initiations[contact]
255+
if self.initiations[contact] == 0:
256+
return self.initiations[root] / 1
257+
if self.initiations[root] == 0:
258+
return 1 / self.initiations[contact]
256259
return self.initiations[root] / self.initiations[contact]
257-
260+
258261
def generate_outcome(self, root, contact, methodology):
259-
outcome = 99;
262+
outcome = 99
260263
if methodology == 0:
261-
if (self.compute_root_initation_ratio(root, contact) > 0.867):
262-
outcome = 0 #"just not that into you"
263-
#print "DOESNT INITIATE"
264-
elif (self.proportions["qmarks"]["ratio"] > 0.87): #flipped the non-intutitive direction of inequality
265-
outcome = 0 #"just not that into you"
266-
#print "QUESTIONS FAIL"
264+
if self.compute_root_initation_ratio(root, contact) > 0.867:
265+
outcome = 0 # "just not that into you"
266+
# print "DOESNT INITIATE"
267+
elif self.proportions["qmarks"]["ratio"] > 0.87: # flipped the non-intuitive direction of inequality
268+
outcome = 0 # "just not that into you"
269+
# print "QUESTIONS FAIL"
267270
else:
268-
outcome = 1 #"definitely into you"
269-
#print "ELSE"
271+
outcome = 1 # "definitely into you"
272+
# print "ELSE"
270273
elif methodology == 1:
271-
if (self.compute_root_initation_ratio(root, contact) > 0.83):
272-
outcome = 0 #"just not that into you"
273-
#print "DOESNT INITIATE"
274-
elif (self.features.compute_avg_root_response_time() < 0.92): #flipped non-intuitive direction of inequality
275-
outcome = 0 #"just not that into you"
276-
#print "QUESTIONS FAIL"
274+
if self.compute_root_initation_ratio(root, contact) > 0.83:
275+
outcome = 0 # "just not that into you"
276+
# print "DOESNT INITIATE"
277+
elif self.features.compute_avg_root_response_time() < 0.92: # flipped non-intuitive direction of inequality
278+
outcome = 0 # "just not that into you"
279+
# print "QUESTIONS FAIL"
277280
else:
278-
outcome = 1 #"definitely into you"
279-
#print "ELSE"
281+
outcome = 1 # "definitely into you"
282+
# print "ELSE"
280283

281284
else:
282-
outcome = 99;
283-
284-
return outcome
285-
286-
# qMarksPerRoot = qmarksRoot/messagesRoot
287-
# qMarksPerContact = qmarksContact/messagesContact
288-
289-
290-
291-
285+
outcome = 99
286+
287+
return outcome

wp_parser/datelib.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1+
import time
12
from datetime import date
23
from datetime import datetime
34
from datetime import timedelta
4-
import time
55

66

77
# get current ymd
@@ -37,11 +37,13 @@ def valid_date(date_str):
3737

3838
return valid
3939

40+
4041
def date_diff(dateobj1, dateobj2):
4142
import math
4243
delta = dateobj2 - dateobj1
4344
return int(math.fabs(delta.days))
4445

46+
4547
def datecmp(date1, date2):
4648
year, month, day = date_split(date1)
4749
year_t, month_t, day_t = date_split(date2)
@@ -53,8 +55,8 @@ def datecmp(date1, date2):
5355
else:
5456
return 1
5557
except ValueError:
56-
#misc.error("Fix me! Invalid date", "datecmp")
57-
print "Fix me! Invalid date"
58+
# misc.error("Fix me! Invalid date", "datecmp")
59+
print("Fix me! Invalid date")
5860
return False
5961

6062

@@ -65,7 +67,7 @@ def date_operation(date_str, num):
6567
return end_date
6668

6769

68-
def date_to_str(date_str):
70+
def date_to_str():
6971
return date.strftime('%Y-%m-%d')
7072

7173

@@ -89,7 +91,7 @@ def date_interval(initial_date, length, step=1, separator="-"):
8991
output = []
9092
current = start_date
9193
while current < end_date:
92-
output.append(date_to_str(current))
94+
output.append(date_to_str())
9395
current += timedelta(days=step)
9496

9597
return output
@@ -119,5 +121,6 @@ def weekday_portuguese_to_english(string):
119121
elif string == "sab" or string == "sabado":
120122
return "Saturday"
121123

124+
122125
if __name__ == "__main__":
123-
print date_diff(datetime(2015, 6, 4), datetime(2015, 07, 7))
126+
print(date_diff(datetime(2015, 6, 4), datetime(2015, 7, 7)))

wp_parser/parsers/facebook.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from datetime import datetime
2-
import message
32

4-
class ParserFacebook():
3+
from . import message
54

6-
''' A line is a dict object in this format:
5+
6+
class ParserFacebook:
7+
""" A line is a dict object in this format:
78
{u'message': u'text text', u'from': u'Username One', u'id':
89
u'3294659605566648_1432085429', u'datetime': u'2015-05-20T01:30:29+0000'}
9-
'''
10+
"""
1011

1112
def __init__(self, raw_messages):
1213
self.raw_messages = raw_messages

0 commit comments

Comments
 (0)