2 changes: 2 additions & 0 deletions .gitignore
@@ -54,6 +54,8 @@ src/features/lexicons/certainty.txt
 examples/vector_data/*
 examples/output/*
 node_modules/
+*.csv
+*.log

 # testing
 /output
430 changes: 334 additions & 96 deletions src/team_comm_tools/feature_builder.py

Large diffs are not rendered by default.
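Since the 430-line feature_builder.py diff is not rendered, here is a minimal sketch of how the new logger is presumably wired through FeatureBuilder, assuming it uses the setup_logger helper added in preprocess.py later in this diff; the calls and parameter placement are assumptions, not the PR's exact code:

# Hypothetical sketch, not the PR's exact feature_builder.py code.
from team_comm_tools.utils.preprocess import setup_logger

# setup_logger is added by this PR in preprocess.py (shown below); the path is illustrative.
logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
logger.info("Starting feature generation...")

# The logger is then passed into each utility, matching the new signatures below:
#   check_embeddings(..., logger=logger)
#   ChatLevelFeaturesCalculator(..., logger=logger)
#   UserLevelFeaturesCalculator(..., logger=logger)
#   ConversationLevelFeaturesCalculator(..., logger=logger)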

8 changes: 7 additions & 1 deletion src/team_comm_tools/utils/calculate_chat_level_features.py
@@ -19,6 +19,7 @@
 # Importing utils
 from .preload_word_lists import *
 from .zscore_chats_and_conversation import get_zscore_across_all_chats, get_zscore_across_all_conversations
+from time import perf_counter

 # Loading bar
 from tqdm import tqdm
@@ -69,7 +70,8 @@ def __init__(
         message_col: str,
         timestamp_col: str | tuple[str, str],
         timestamp_unit: str,
-        custom_liwc_dictionary: dict
+        custom_liwc_dictionary: dict,
+        logger: logging.Logger
     ) -> None:

         self.chat_data = chat_data
@@ -86,6 +88,7 @@
         self.function_words = get_function_words() # load function words exactly once
         self.question_words = get_question_words() # load question words exactly once
         self.first_person = get_first_person_words() # load first person words exactly once
+        self.logger = logger

     def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
         """
@@ -99,7 +102,10 @@ def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
"""

for method in tqdm(feature_methods):
start_time = perf_counter()
method(self)
end_time = perf_counter()
self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")

# Return the input dataset with the chat level features appended (as columns)
return self.chat_data
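Taken in isolation, the timing pattern added above behaves like this runnable sketch; the feature method and logger here are illustrative stand-ins, not the package's real ones:

import logging
from time import perf_counter

# Mirror the formatter that setup_logger (preprocess.py, below) configures.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger("timing_demo")

def count_words(_):  # stand-in for a real feature method
    sum(len(s.split()) for s in ["hello world"] * 10_000)

for method in [count_words]:
    start_time = perf_counter()
    method(None)
    end_time = perf_counter()
    logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")
# Emits a line like: 2025-01-01 12:00:00,000 INFO  - count_words: 0.01 seconds.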
src/team_comm_tools/utils/calculate_conversation_level_features.py
@@ -8,6 +8,7 @@
 from team_comm_tools.utils.gini_coefficient import *
 from team_comm_tools.utils.preprocess import *
 from fuzzywuzzy import process
+from time import perf_counter

 class ConversationLevelFeaturesCalculator:
     """
@@ -57,6 +58,7 @@ def __init__(self, chat_data: pd.DataFrame,
                  user_methods: list,
                  user_columns: list,
                  chat_features: list,
+                 logger
                  ) -> None:

         # Initializing variables
@@ -75,6 +77,7 @@
         self.user_methods = user_methods
         self.user_columns = user_columns
         self.chat_features = chat_features
+        self.logger = logger

     def clean_up_aggregation_method_names(aggregation_method_names:list, method_param:str) -> list:
         """
@@ -234,7 +237,10 @@ def calculate_conversation_level_features(self, feature_methods: list) -> pd.Dat
"""

for method in feature_methods:
start_time = perf_counter()
method(self)
end_time = perf_counter()
self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")

return self.conv_data

21 changes: 18 additions & 3 deletions src/team_comm_tools/utils/calculate_user_level_features.py
@@ -3,6 +3,7 @@
 from team_comm_tools.features.get_user_network import *
 from team_comm_tools.features.user_centroids import *
 from fuzzywuzzy import process
+from time import perf_counter

 class UserLevelFeaturesCalculator:
     """
@@ -38,7 +39,8 @@ def __init__(self, chat_data: pd.DataFrame,
                  user_aggregation: bool,
                  user_methods: list,
                  user_columns: list,
-                 chat_features: list) -> None:
+                 chat_features: list,
+                 logger) -> None:

         # Initializing variables
         self.chat_data = chat_data
@@ -49,6 +51,7 @@
         self.user_aggregation = user_aggregation
         self.user_methods = user_methods
         self.chat_features = chat_features
+        self.logger = logger

     def clean_up_aggregation_method_names(aggregation_method_names:list) -> list:
         """
@@ -152,16 +155,28 @@ def calculate_user_level_features(self) -> pd.DataFrame:
"""

# Get total counts for features that need to be summed, regardless of what the user specified
start_time = perf_counter()
self.get_user_level_summed_features()

end_time = perf_counter()
self.logger.info(f" - user_level_summed_features: {end_time - start_time:.2f} seconds.")

# Get user summary statistics for all features (e.g. mean, min, max, stdev)
start_time = perf_counter()
self.get_user_level_summary_statistics_features()

end_time = perf_counter()
self.logger.info(f" - user_level_summary_statistics_features: {end_time - start_time:.2f} seconds.")

# Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range)
start_time = perf_counter()
self.get_centroids()
end_time = perf_counter()
self.logger.info(f" - user_centroids: {end_time - start_time:.2f} seconds.")

# Get list of other users in a given conversation
start_time = perf_counter()
self.get_user_network()
end_time = perf_counter()
self.logger.info(f" - user_network: {end_time - start_time:.2f} seconds.")

return self.user_data

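The four timed blocks above repeat the same start/stop/log boilerplate; a small context manager would keep it DRY. A sketch of that alternative (not part of this PR):

import logging
from contextlib import contextmanager
from time import perf_counter

@contextmanager
def timed(logger: logging.Logger, label: str):
    # Log how long the wrapped block took, mirroring the PR's log format.
    start = perf_counter()
    try:
        yield
    finally:
        logger.info(f" - {label}: {perf_counter() - start:.2f} seconds.")

# Hypothetical usage inside calculate_user_level_features:
#   with timed(self.logger, "user_centroids"):
#       self.get_centroids()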
45 changes: 36 additions & 9 deletions src/team_comm_tools/utils/check_embeddings.py
@@ -6,14 +6,13 @@
 import warnings
 from tqdm import tqdm
 from pathlib import Path
+from time import perf_counter

-import torch
-from sentence_transformers import SentenceTransformer, util
+from torch import cuda, no_grad
+from sentence_transformers import SentenceTransformer

-from transformers import AutoTokenizer
-from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, logging
 from scipy.special import softmax
-from transformers import logging

 logging.set_verbosity(40) # only log errors

@@ -26,7 +25,7 @@

 # Check if embeddings exist
 def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool,
-                     need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str = "message"):
+                     need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str, logger):
     """
     Check if embeddings and required lexicons exist, and generate them if they don't.

@@ -49,43 +48,71 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne
     :type use_gpu: bool
     :param message_col: A string representing the column name that should be selected as the message. Defaults to "message".
     :type message_col: str, optional
+    :param logger: Logger for logging messages
+    :type logger: logging.Logger

     :return: None
     :rtype: None
     """
     device = "cpu"
     if use_gpu:
-        if torch.cuda.is_available():
-            print("Using GPU for embeddings.")
+        if cuda.is_available():
+            logger.info("Using GPU for embeddings.")
             device = "cuda"
         else:
-            print("GPU not available, using CPU for embeddings.")
+            logger.info("GPU not available, using CPU for embeddings.")

     if (regenerate_vectors or (not os.path.isfile(vect_path))) and need_sentence:
+        logger.info("Generating sentence vectors cache...")
+        start_time = perf_counter()
         generate_vect(chat_data, vect_path, message_col, device)
+        end_time = perf_counter()
+        logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.")
     if (regenerate_vectors or (not os.path.isfile(bert_path))) and need_sentiment:
+        logger.info("Generating BERT vectors cache...")
+        start_time = perf_counter()
         generate_bert(chat_data, bert_path, message_col, device)
+        end_time = perf_counter()
+        logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.")

     try:
         vector_df = pd.read_csv(vect_path)
         # check whether the given vector and bert data matches length of chat data
         if len(vector_df) != len(chat_data):
-            print("ERROR: The length of the vector data does not match the length of the chat data. Regenerating...")
+            logger.error("The length of the vector data does not match the length of the chat data. Regenerating...")
+            start_time = perf_counter()
             generate_vect(chat_data, vect_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"Sentence vectors regeneration completed in {end_time - start_time:.2f} seconds.")
     except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary
         if need_sentence:
+            logger.error("Vector embeddings file not found. Generating new vector embeddings.")
+            start_time = perf_counter()
             generate_vect(chat_data, vect_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.")

     try:
         bert_df = pd.read_csv(bert_path)
         if len(bert_df) != len(chat_data):
-            print("ERROR: The length of the sentiment data does not match the length of the chat data. Regenerating...")
+            logger.error("The length of the sentiment data does not match the length of the chat data. Regenerating...")
             # delete the file
+            start_time = perf_counter()
             generate_bert(chat_data, bert_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"BERT vectors regeneration completed in {end_time - start_time:.2f} seconds.")
     except FileNotFoundError:
         if need_sentiment: # It's OK if we don't have the path, if the sentiment features are not necessary
+            logger.error("BERT sentiment file not found. Generating new BERT sentiments.")
+            start_time = perf_counter()
             generate_bert(chat_data, bert_path, message_col, device)

+            end_time = perf_counter()
+            logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.")

     # Get the lexicon pickle(s) if they don't exist
     current_script_directory = Path(__file__).resolve().parent
     LEXICON_PATH_STATIC = current_script_directory.parent/"features/assets/lexicons_dict.pkl"
@@ -448,7 +475,7 @@ def get_sentiment(texts, model_bert, device):

     encoded = tokenizer(non_null_non_empty_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
     encoded = {k: v.to(device) for k, v in encoded.items()}
-    with torch.no_grad():
+    with no_grad():
         output = model_bert(**encoded)

     scores = output[0].detach().cpu().numpy()
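For reference, the inference pattern in get_sentiment (tokenize, run the model under no_grad, softmax the logits) looks like this standalone sketch; the checkpoint name is an assumption for illustration, not necessarily the one the package ships with:

from torch import no_grad
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL)

texts = ["great teamwork!", "this is not working"]
encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
with no_grad():
    output = model_bert(**encoded)
scores = softmax(output[0].detach().cpu().numpy(), axis=1)  # one probability row per text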
34 changes: 33 additions & 1 deletion src/team_comm_tools/utils/preprocess.py
@@ -1,6 +1,7 @@
 import re
 import logging
 import pandas as pd
-import warnings
+# import warnings
+import os

 EMOJIS = {
     "(:", "(;", "):", "/:", ":(", ":)", ":/", ";)", # 8 emojis from LIWC 2017
@@ -296,3 +297,34 @@ def create_cumulative_rows(input_df, conversation_id, timestamp_col, grouping_ke
     )

     return result_df
+
+def setup_logger(name: str, log_file_path: str, level: int=logging.INFO):
+    """Set up a logger
+
+    :param name: The name of the logger.
+    :type name: str
+    :param log_file_path: Path to the log file, such as './output/logs/feature_builder.log'.
+    :type log_file_path: str
+    :param level: Logging level, defaults to logging.INFO. All levels: 0: NOTSET, 10: DEBUG, 20: INFO, 30: WARNING, 40: ERROR, 50: CRITICAL.
+    :type level: int, optional
+    :return: Configured logger.
+    :rtype: logging.Logger
+    """
+    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
+    log_dir = os.path.dirname(log_file_path)
+    if log_dir:
+        os.makedirs(log_dir, exist_ok=True)
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    # Prevent "double logging" via parent/root handlers
+    logger.propagate = False
+    abs_path = os.path.abspath(log_file_path)
+    # If a FileHandler for this same file already exists, don't add another
+    for h in logger.handlers:
+        if isinstance(h, logging.FileHandler) and os.path.abspath(getattr(h, "baseFilename", "")) == abs_path:
+            return logger
+    handler = logging.FileHandler(log_file_path)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
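A short usage sketch for setup_logger (paths illustrative). A repeat call with the same name and file returns the existing logger rather than attaching a duplicate handler:

# assuming: from team_comm_tools.utils.preprocess import setup_logger
logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
logger.info("Feature generation started.")

# Same logger object, still a single FileHandler: no duplicated log lines.
same_logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
assert same_logger is logger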