-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutility.py
More file actions
93 lines (74 loc) · 2.87 KB
/
utility.py
File metadata and controls
93 lines (74 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import json
from pathlib import Path
import os
import glob
def load_json_data(file):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        file: Path (or anything ``open`` accepts) of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
def write_json_data(file, data):
    """Serialize ``data`` as JSON into ``file``, replacing any existing content.

    The file is opened in write mode with UTF-8 encoding and the JSON is
    pretty-printed with a two-space indent.

    Args:
        file: Path (or anything ``open`` accepts) of the file to write.
        data: Object to serialize; must be accepted by ``json.dump``.
    """
    with open(file, mode='w', encoding='utf-8') as sink:
        json.dump(data, fp=sink, indent=2)
def stopwords(nlp, custom_stop):
    """Return the stop-word collection of ``nlp``, extended with custom entries.

    Args:
        nlp: Object exposing ``Defaults.stop_words`` (presumably a spaCy
            language pipeline -- confirm against the caller).
        custom_stop: Extra stop words to merge in with ``|=``; skipped when
            falsy (``None``, empty set, ...).

    Returns:
        ``nlp.Defaults.stop_words`` after the optional merge.
    """
    # Nothing to merge: hand back the collection untouched.
    if not custom_stop:
        return nlp.Defaults.stop_words

    # The augmented assignment updates the attribute on nlp.Defaults itself,
    # so the additions stay visible to later readers of that attribute.
    nlp.Defaults.stop_words |= custom_stop
    return nlp.Defaults.stop_words
def get_fname(fpath: str) -> str:
    """Return the file name of ``fpath`` without directory and last suffix.

    Args:
        fpath: Path to a file, as a string.

    Returns:
        The final path component with its last extension removed
        (``Path(fpath).stem``).
    """
    stem = Path(fpath).stem
    return stem
def get_json_files() -> list:
    """Return the paths of all ``*.json`` files in the current working directory.

    Only the directory itself is searched (no recursion). The paths are
    absolute because they are built from ``os.getcwd()``; their order is
    whatever ``glob.glob`` returns.

    Returns:
        A list of path strings, empty when no file matches.
    """
    # Escape the directory part: without it, glob metacharacters in the
    # directory name (e.g. "run[1]") would be interpreted as a pattern and
    # the search would silently look in the wrong place.
    pattern = os.path.join(glob.escape(os.getcwd()), '*.json')
    return glob.glob(pattern)
def Interval(start: int, end: int, step: int = 1):
    """Yield numbers from ``start`` up to and including ``end``.

    Unlike ``range()``, the upper bound is inclusive. Values advance by
    ``step`` while they are below ``end``; ``end`` itself is then always
    yielded last, even when it does not fall on the step grid
    (``Interval(0, 5, 2)`` yields 0, 2, 4, 5).

    Args:
        start: First value of the interval.
        end: Last value of the interval (inclusive).
        step: Positive increment between consecutive values.

    Yields:
        The values of the interval in ascending order; nothing when
        ``start > end``.

    Raises:
        ValueError: If ``step`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A non-positive step can never reach `end` and would loop forever.
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")
    # An inverted interval is empty, mirroring range(start, end + 1).
    if start > end:
        return
    current = start
    while current < end:
        yield current
        current += step
    yield end
def max_csv_columns(fname: str, delimiter) -> list:
    """Return generated column names covering the widest row of a CSV file.

    Each line is split on ``delimiter`` with ``str.split`` (no quoting rules
    are applied) and the largest field count found determines how many names
    are produced. Pass the result as the ``names=`` argument of
    ``pandas.read_csv`` to load files whose rows have differing lengths.

    Args:
        fname: Path of the CSV file to inspect.
        delimiter: Field separator handed to ``str.split``.

    Returns:
        ``[0, 1, ..., n - 1]`` where ``n`` is the largest number of fields on
        any line; an empty list for an empty file.
    """
    with open(fname, 'r') as csv_file:
        # Iterate the file lazily instead of loading every line at once.
        # len(split) already equals the number of fields on the line, so no
        # extra column is added on top of it.
        widest = max(
            (len(line.split(delimiter)) for line in csv_file),
            default=0,
        )
    return list(range(widest))
"""
Chunks each text in a list into segments of a specified number of words.
Chunks never span two input texts; the last chunk of each text may be shorter.
Args:
para_list (list): The list of input texts to be chunked.
chunk_size_words (int): The maximum number of words per chunk.
Returns:
list: A list of strings, where each string is a text chunk.
"""
def chunk_text_by_words(para_list: list, chunk_size_words: int = 256) -> list:
    """Split every text in ``para_list`` into chunks of whitespace-separated words.

    Each text is processed independently, so a chunk never mixes words from
    two different texts. Words inside a chunk are re-joined with single
    spaces.

    Args:
        para_list: The texts (strings) to be chunked.
        chunk_size_words: Number of words after which a chunk is closed.

    Returns:
        A flat list of chunk strings in input order; the final chunk of each
        text may hold fewer words than ``chunk_size_words``.
    """
    segments = []
    for paragraph in para_list:
        pending = []
        for token in paragraph.split():
            pending.append(token)
            # The length of `pending` is the word count of the open chunk.
            if len(pending) < chunk_size_words:
                continue
            segments.append(' '.join(pending))
            pending = []
        # Flush the words left over once the text is exhausted.
        if pending:
            segments.append(' '.join(pending))
    return segments