# BERTopic-Topic-Modeling/utility.py at main · DWHowes/BERTopic-Topic-Modeling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import json
from pathlib import Path
import os
import glob

def load_json_data(file):
    """Parse the JSON document stored at ``file`` and return the result.

    Args:
        file: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def write_json_data(file, data):
    """Serialize ``data`` as JSON into ``file``.

    The target is opened for writing as UTF-8 text (replacing any existing
    content) and the output is pretty-printed with a two-space indent.

    Args:
        file: Path of the file to write.
        data: Object to serialize with ``json.dump``.
    """
    with open(file, mode='w', encoding='utf-8') as handle:
        json.dump(data, fp=handle, indent=2)

def stopwords(nlp, custom_stop):
    """Return ``nlp.Defaults.stop_words``, first merging in ``custom_stop``.

    Args:
        nlp: Object exposing ``Defaults.stop_words``
            (presumably a spaCy language object -- confirm with callers).
        custom_stop: Extra stop words to merge in with ``|=``; a falsy value
            (``None``, empty set) leaves the defaults untouched.

    Returns:
        The ``nlp.Defaults.stop_words`` collection after any merge.
    """
    # Nothing to add: hand back the defaults unchanged.
    if not custom_stop:
        return nlp.Defaults.stop_words

    # The merge is written back onto nlp.Defaults, so it persists for later
    # readers of that attribute, not just for this call's return value.
    nlp.Defaults.stop_words |= custom_stop
    return nlp.Defaults.stop_words

def get_fname(fpath:str)->str:
    """Return the file name of ``fpath`` without directory or last suffix.

    Only the final suffix is removed, so ``archive.tar.gz`` becomes
    ``archive.tar``.
    """
    stem = Path(fpath).stem
    return stem

def get_json_files()->list:
    """Return the paths of all ``*.json`` files in the current working directory.

    Only the directory itself is searched (no recursion). Each returned path
    is the working directory joined with the matching file name.

    Returns:
        list: Matching paths, in the arbitrary order produced by ``glob.glob``.
    """
    # Escape the directory part so that glob metacharacters in its name
    # ('[', ']', '*', '?') are matched literally instead of being treated as
    # wildcards, which would otherwise make the search miss its own files.
    search_dir = glob.escape(os.getcwd())
    pattern = os.path.join(search_dir, '*.json')
    return glob.glob(pattern)

def Interval(start:int, end:int, step:int=1):
    """Yield numbers from ``start`` up to and including ``end``.

    This differs from ``range()``, which stops at ``end - 1``. Values
    ``start, start + step, ...`` are produced while they are below ``end``;
    ``end`` itself is then always yielded as the final value, even when it
    does not fall on the step grid (``Interval(0, 5, 2)`` gives 0, 2, 4, 5)
    or when ``start >= end`` (only ``end`` is produced).

    Args:
        start: First value of the traversal.
        end: Last value, always yielded.
        step: Increment between successive values; defaults to 1.

    Raises:
        ValueError: If ``start < end`` and ``step`` is not positive, because
            the traversal could never reach ``end``. As this is a generator,
            the error surfaces on the first iteration.
    """
    # A zero or negative step can never advance towards a larger end; without
    # this guard the loop below would run forever.
    if start < end and step <= 0:
        raise ValueError("step must be positive when start < end")

    i = start
    while i < end:
        yield i
        i += step
    yield end

def max_csv_columns(fname:str, delimiter)->list:
    """Generate integer column names wide enough for every line of a csv file.

    Each line of ``fname`` is split on ``delimiter`` and the largest field
    count found decides the length of the result. Use the returned value in
    the ``names=`` parameter of a Pandas ``read_csv`` call.

    The split is purely textual: delimiters inside quoted fields are counted
    as separators.

    Args:
        fname: Path of the csv file to scan.
        delimiter: Separator string passed to ``str.split``.

    Returns:
        list: ``[0, 1, ..., n - 1]`` where ``n`` is the widest line's field
        count plus one; an empty list when the file has no lines.
    """
    with open(fname, 'r') as csv_file:
        # Iterate the file object directly so only one line is held in memory
        # at a time, instead of materializing the whole file with readlines().
        # NOTE(review): the "+ 1" yields one more name than the widest line
        # has fields. It is kept so existing callers see the same list, but
        # it looks like an off-by-one -- confirm before removing.
        largest_column_count = max(
            (len(line.split(delimiter)) + 1 for line in csv_file),
            default=0,
        )

    # Column names are simply 0, 1, 2, ..., largest_column_count - 1.
    return list(range(largest_column_count))

"""
Chunks a given text into segments of a specified number of words.

Args:
    paras (list): The list of input text to be chunked.
    chunk_size_words (int): The approximate number of words per chunk.

Returns:
    list: A list of strings, where each string is a text chunk.
"""
def chunk_text_by_words(para_list:list, chunk_size_words:int=256)->list:
    chunks = []
    for para in para_list:
        words = para.split()  # Split the text into a list of words
        current_chunk_words = []
        current_word_count = 0

        for word in words:
            current_chunk_words.append(word)
            current_word_count += 1

            if current_word_count >= chunk_size_words:
                chunks.append(' '.join(current_chunk_words))
                current_chunk_words = []
                current_word_count = 0

        # Add any remaining words as the last chunk
        if current_chunk_words:
            chunks.append(' '.join(current_chunk_words))

    return chunks