2 changes: 2 additions & 0 deletions .gitignore
@@ -54,6 +54,8 @@ src/features/lexicons/certainty.txt
 examples/vector_data/*
 examples/output/*
 node_modules/
+*.csv
+*.log

 # testing
 /output
430 changes: 334 additions & 96 deletions src/team_comm_tools/feature_builder.py

Large diffs are not rendered by default.
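Since the 430-line feature_builder.py diff is not rendered, here is a minimal sketch of how the new logger is presumably wired through FeatureBuilder, assuming it uses the setup_logger helper added in preprocess.py later in this diff; the calls and parameter placement are assumptions, not the PR's exact code:

# Hypothetical sketch, not the PR's exact feature_builder.py code.
from team_comm_tools.utils.preprocess import setup_logger

# setup_logger is added by this PR in preprocess.py (shown below); the path is illustrative.
logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
logger.info("Starting feature generation...")

# The logger is then passed into each utility, matching the new signatures below:
#   check_embeddings(..., logger=logger)
#   ChatLevelFeaturesCalculator(..., logger=logger)
#   UserLevelFeaturesCalculator(..., logger=logger)
#   ConversationLevelFeaturesCalculator(..., logger=logger)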

8 changes: 7 additions & 1 deletion src/team_comm_tools/utils/calculate_chat_level_features.py
@@ -19,6 +19,7 @@
 # Importing utils
 from .preload_word_lists import *
 from .zscore_chats_and_conversation import get_zscore_across_all_chats, get_zscore_across_all_conversations
+from time import perf_counter

 # Loading bar
 from tqdm import tqdm
@@ -69,7 +70,8 @@ def __init__(
         message_col: str,
         timestamp_col: str | tuple[str, str],
         timestamp_unit: str,
-        custom_liwc_dictionary: dict
+        custom_liwc_dictionary: dict,
+        logger: logging.Logger
     ) -> None:

         self.chat_data = chat_data
@@ -86,6 +88,7 @@
         self.function_words = get_function_words() # load function words exactly once
         self.question_words = get_question_words() # load question words exactly once
         self.first_person = get_first_person_words() # load first person words exactly once
+        self.logger = logger

     def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
         """
@@ -99,7 +102,10 @@ def calculate_chat_level_features(self, feature_methods: list) -> pd.DataFrame:
"""

for method in tqdm(feature_methods):
start_time = perf_counter()
method(self)
end_time = perf_counter()
self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")

# Return the input dataset with the chat level features appended (as columns)
return self.chat_data
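Taken in isolation, the timing pattern added above behaves like this runnable sketch; the feature method and logger here are illustrative stand-ins, not the package's real ones:

import logging
from time import perf_counter

# Mirror the formatter that setup_logger (preprocess.py, below) configures.
logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger("timing_demo")

def count_words(_):  # stand-in for a real feature method
    sum(len(s.split()) for s in ["hello world"] * 10_000)

for method in [count_words]:
    start_time = perf_counter()
    method(None)
    end_time = perf_counter()
    logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")
# Emits a line like: 2025-01-01 12:00:00,000 INFO  - count_words: 0.01 seconds.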
src/team_comm_tools/utils/calculate_conversation_level_features.py
@@ -8,6 +8,7 @@
 from team_comm_tools.utils.gini_coefficient import *
 from team_comm_tools.utils.preprocess import *
 from fuzzywuzzy import process
+from time import perf_counter

 class ConversationLevelFeaturesCalculator:
     """
@@ -57,6 +58,7 @@ def __init__(self, chat_data: pd.DataFrame,
                  user_methods: list,
                  user_columns: list,
                  chat_features: list,
+                 logger
                  ) -> None:

         # Initializing variables
@@ -75,6 +77,7 @@
         self.user_methods = user_methods
         self.user_columns = user_columns
         self.chat_features = chat_features
+        self.logger = logger

     def clean_up_aggregation_method_names(aggregation_method_names:list, method_param:str) -> list:
         """
@@ -234,7 +237,10 @@ def calculate_conversation_level_features(self, feature_methods: list) -> pd.Dat
"""

for method in feature_methods:
start_time = perf_counter()
method(self)
end_time = perf_counter()
self.logger.info(f" - {method.__name__}: {end_time - start_time:.2f} seconds.")

return self.conv_data

21 changes: 18 additions & 3 deletions src/team_comm_tools/utils/calculate_user_level_features.py
@@ -3,6 +3,7 @@
 from team_comm_tools.features.get_user_network import *
 from team_comm_tools.features.user_centroids import *
 from fuzzywuzzy import process
+from time import perf_counter

 class UserLevelFeaturesCalculator:
     """
@@ -38,7 +39,8 @@ def __init__(self, chat_data: pd.DataFrame,
                  user_aggregation: bool,
                  user_methods: list,
                  user_columns: list,
-                 chat_features: list) -> None:
+                 chat_features: list,
+                 logger) -> None:

         # Initializing variables
         self.chat_data = chat_data
@@ -49,6 +51,7 @@
         self.user_aggregation = user_aggregation
         self.user_methods = user_methods
         self.chat_features = chat_features
+        self.logger = logger

     def clean_up_aggregation_method_names(aggregation_method_names:list) -> list:
         """
@@ -152,16 +155,28 @@ def calculate_user_level_features(self) -> pd.DataFrame:
"""

# Get total counts for features that need to be summed, regardless of what the user specified
start_time = perf_counter()
self.get_user_level_summed_features()

end_time = perf_counter()
self.logger.info(f" - user_level_summed_features: {end_time - start_time:.2f} seconds.")

# Get user summary statistics for all features (e.g. mean, min, max, stdev)
start_time = perf_counter()
self.get_user_level_summary_statistics_features()

end_time = perf_counter()
self.logger.info(f" - user_level_summary_statistics_features: {end_time - start_time:.2f} seconds.")

# Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range)
start_time = perf_counter()
self.get_centroids()
end_time = perf_counter()
self.logger.info(f" - user_centroids: {end_time - start_time:.2f} seconds.")

# Get list of other users in a given conversation
start_time = perf_counter()
self.get_user_network()
end_time = perf_counter()
self.logger.info(f" - user_network: {end_time - start_time:.2f} seconds.")

return self.user_data

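The four timed blocks above repeat the same start/stop/log boilerplate; a small context manager would keep it DRY. A sketch of that alternative (not part of this PR):

import logging
from contextlib import contextmanager
from time import perf_counter

@contextmanager
def timed(logger: logging.Logger, label: str):
    # Log how long the wrapped block took, mirroring the PR's log format.
    start = perf_counter()
    try:
        yield
    finally:
        logger.info(f" - {label}: {perf_counter() - start:.2f} seconds.")

# Hypothetical usage inside calculate_user_level_features:
#   with timed(self.logger, "user_centroids"):
#       self.get_centroids()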
45 changes: 36 additions & 9 deletions src/team_comm_tools/utils/check_embeddings.py
@@ -6,14 +6,13 @@
 import warnings
 from tqdm import tqdm
 from pathlib import Path
+from time import perf_counter

-import torch
-from sentence_transformers import SentenceTransformer, util
+from torch import cuda, no_grad
+from sentence_transformers import SentenceTransformer

-from transformers import AutoTokenizer
-from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, logging
 from scipy.special import softmax
-from transformers import logging

 logging.set_verbosity(40) # only log errors

@@ -26,7 +25,7 @@

 # Check if embeddings exist
 def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool,
-                     need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str = "message"):
+                     need_sentiment: bool, regenerate_vectors: bool, use_gpu: bool, message_col: str, logger):
     """
     Check if embeddings and required lexicons exist, and generate them if they don't.

@@ -49,43 +48,71 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne
     :type use_gpu: bool
     :param message_col: A string representing the column name that should be selected as the message. Defaults to "message".
     :type message_col: str, optional
+    :param logger: Logger for logging messages
+    :type logger: logging.Logger

     :return: None
     :rtype: None
     """
     device = "cpu"
     if use_gpu:
-        if torch.cuda.is_available():
-            print("Using GPU for embeddings.")
+        if cuda.is_available():
+            logger.info("Using GPU for embeddings.")
             device = "cuda"
         else:
-            print("GPU not available, using CPU for embeddings.")
+            logger.info("GPU not available, using CPU for embeddings.")

     if (regenerate_vectors or (not os.path.isfile(vect_path))) and need_sentence:
+        logger.info("Generating sentence vectors cache...")
+        start_time = perf_counter()
         generate_vect(chat_data, vect_path, message_col, device)
+        end_time = perf_counter()
+        logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.")
     if (regenerate_vectors or (not os.path.isfile(bert_path))) and need_sentiment:
+        logger.info("Generating BERT vectors cache...")
+        start_time = perf_counter()
         generate_bert(chat_data, bert_path, message_col, device)
+        end_time = perf_counter()
+        logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.")

     try:
         vector_df = pd.read_csv(vect_path)
         # check whether the given vector and bert data matches length of chat data
         if len(vector_df) != len(chat_data):
-            print("ERROR: The length of the vector data does not match the length of the chat data. Regenerating...")
+            logger.error("The length of the vector data does not match the length of the chat data. Regenerating...")
+            start_time = perf_counter()
             generate_vect(chat_data, vect_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"Sentence vectors regeneration completed in {end_time - start_time:.2f} seconds.")
     except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary
         if need_sentence:
+            logger.error("Vector embeddings file not found. Generating new vector embeddings.")
+            start_time = perf_counter()
             generate_vect(chat_data, vect_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"Sentence vectors generation completed in {end_time - start_time:.2f} seconds.")

     try:
         bert_df = pd.read_csv(bert_path)
         if len(bert_df) != len(chat_data):
-            print("ERROR: The length of the sentiment data does not match the length of the chat data. Regenerating...")
+            logger.error("The length of the sentiment data does not match the length of the chat data. Regenerating...")
             # delete the file
+            start_time = perf_counter()
             generate_bert(chat_data, bert_path, message_col, device)
+            end_time = perf_counter()
+            logger.info(f"BERT vectors regeneration completed in {end_time - start_time:.2f} seconds.")
     except FileNotFoundError:
         if need_sentiment: # It's OK if we don't have the path, if the sentiment features are not necessary
+            logger.error("BERT sentiment file not found. Generating new BERT sentiments.")
+            start_time = perf_counter()
             generate_bert(chat_data, bert_path, message_col, device)

+            end_time = perf_counter()
+            logger.info(f"BERT vectors generation completed in {end_time - start_time:.2f} seconds.")

     # Get the lexicon pickle(s) if they don't exist
     current_script_directory = Path(__file__).resolve().parent
     LEXICON_PATH_STATIC = current_script_directory.parent/"features/assets/lexicons_dict.pkl"
@@ -448,7 +475,7 @@ def get_sentiment(texts, model_bert, device):

     encoded = tokenizer(non_null_non_empty_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
     encoded = {k: v.to(device) for k, v in encoded.items()}
-    with torch.no_grad():
+    with no_grad():
         output = model_bert(**encoded)

     scores = output[0].detach().cpu().numpy()
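For reference, the inference pattern in get_sentiment (tokenize, run the model under no_grad, softmax the logits) looks like this standalone sketch; the checkpoint name is an assumption for illustration, not necessarily the one the package ships with:

from torch import no_grad
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL)

texts = ["great teamwork!", "this is not working"]
encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
with no_grad():
    output = model_bert(**encoded)
scores = softmax(output[0].detach().cpu().numpy(), axis=1)  # one probability row per text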
34 changes: 33 additions & 1 deletion src/team_comm_tools/utils/preprocess.py
@@ -1,6 +1,7 @@
 import re
 import logging
 import pandas as pd
-import warnings
+# import warnings
+import os

 EMOJIS = {
     "(:", "(;", "):", "/:", ":(", ":)", ":/", ";)", # 8 emojis from LIWC 2017
@@ -296,3 +297,34 @@ def create_cumulative_rows(input_df, conversation_id, timestamp_col, grouping_ke
     )

     return result_df
+
+def setup_logger(name: str, log_file_path: str, level: int=logging.INFO):
+    """Set up a logger
+
+    :param name: The name of the logger.
+    :type name: str
+    :param log_file_path: Path to the log file, such as './output/logs/feature_builder.log'.
+    :type log_file_path: str
+    :param level: Logging level, defaults to logging.INFO. All levels: 0: NOTSET, 10: DEBUG, 20: INFO, 30: WARNING, 40: ERROR, 50: CRITICAL.
+    :type level: int, optional
+    :return: Configured logger.
+    :rtype: logging.Logger
+    """
+    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
+    log_dir = os.path.dirname(log_file_path)
+    if log_dir:
+        os.makedirs(log_dir, exist_ok=True)
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    # Prevent "double logging" via parent/root handlers
+    logger.propagate = False
+    abs_path = os.path.abspath(log_file_path)
+    # If a FileHandler for this same file already exists, don't add another
+    for h in logger.handlers:
+        if isinstance(h, logging.FileHandler) and os.path.abspath(getattr(h, "baseFilename", "")) == abs_path:
+            return logger
+    handler = logging.FileHandler(log_file_path)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    return logger
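A short usage sketch for setup_logger (paths illustrative). A repeat call with the same name and file returns the existing logger rather than attaching a duplicate handler:

# assuming: from team_comm_tools.utils.preprocess import setup_logger
logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
logger.info("Feature generation started.")

# Same logger object, still a single FileHandler: no duplicated log lines.
same_logger = setup_logger("feature_builder", "./output/logs/feature_builder.log")
assert same_logger is logger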