-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcode_clean_utils.py
More file actions
165 lines (135 loc) · 4.92 KB
/
code_clean_utils.py
File metadata and controls
165 lines (135 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
import re
import codecs
import mysql.connector
# Delimiter characters collected by get_split_set().
SPLIT_CHARS = [
    ',', '+', '&', '!', '%', '?', '_', '|', ':', '-', '=', '\\', '~', '*',
    '^', '<', '>', '[', ']', '$', '{', '}', ';', '.', '`', '@', '(', ')',
]
# Tokenizer used by code_anonymous() and get_normalize_code(): splits on one
# delimiter character at a time and, because of the capturing group, keeps
# each delimiter as its own element of the result.
# Written as a raw *text* pattern: the value is identical to the former
# b"..." literal on Python 2, but it can also be applied to str on Python 3
# and no longer relies on invalid escape sequences such as "\s".
_WORD_SPLIT = re.compile(r"([,+\-&!%'_?|=\s/\*^<>$@\[\](){}#;])")
# all kinds of split chars
def get_split_set():
    """Return the delimiter characters of SPLIT_CHARS as a set.

    Returns:
        A new set holding every entry of the module-level SPLIT_CHARS list.
    """
    # The set used to be built but never returned, so callers received None.
    return set(SPLIT_CHARS)
# strip every C/C++ comment out of a piece of code
def remove_cpp_comment(code):
    """Return *code* with its // and /* */ comments removed.

    Newlines that were inside a comment are kept, so the line numbering of
    the remaining code does not change. Single- and double-quoted literals
    are matched by the same pattern and copied through untouched, so comment
    markers inside them are left alone.
    """
    comment_or_literal = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def _substitute(found):
        text = found.group(0)
        # Quoted literals never start with '/', comments always do.
        if not text.startswith('/'):
            return text
        # A comment collapses to just the newlines it contained.
        return "" + "\n" * text.count('\n')

    return comment_or_literal.sub(_substitute, code)
# replace non-ASCII chars
def replace_trash(unicode_string):
    """Return *unicode_string* with every non-ASCII character turned into a space.

    The previous implementation reset the whole string to "" on the first
    non-ASCII character (and then hid the resulting IndexErrors behind a bare
    ``except``), which threw away all of the ASCII text as well.

    Args:
        unicode_string: text to clean.

    Returns:
        A string of the same length in which each character with a code
        point of 128 or above is replaced by a single space.
    """
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in unicode_string)
# blank out the non-ASCII characters of a text
def remove_non_ascii(text):
    """Return *text* with each character whose code point is >= 128 replaced by a space."""
    cleaned = []
    for ch in text:
        cleaned.append(' ' if ord(ch) >= 128 else ch)
    return ''.join(cleaned)
# line count
def line_count(file):
    """Count the newline bytes in the file at path *file*.

    The file is read in binary mode in 8 MiB chunks, so large files are never
    loaded into memory at once. A final line without a trailing newline is
    not counted, because only ``\\n`` bytes are tallied.

    Args:
        file: path of the file to scan.

    Returns:
        The number of ``\\n`` bytes found.
    """
    chunk_size = 8192 * 1024
    count = 0
    # ``with`` closes the handle even if a read fails part-way through.
    with open(file, 'rb') as the_file:
        while True:
            chunk = the_file.read(chunk_size)
            if not chunk:
                break
            # Binary reads yield bytes; counting a str here raises on Python 3.
            count += chunk.count(b'\n')
    return count
# drop standard I/O and file-operation lines
def code_clean(text):
    """Strip I/O-related lines from *text* and flatten its whitespace.

    The substitutions run in a fixed order:
      1. runs of spaces become one space, blank lines are squeezed out;
      2. every newline-terminated line containing ``print``, ``cout``,
         ``cin`` or ``scanf`` is deleted;
      3. tabs become spaces;
      4. every newline-terminated line containing ``Reader``, ``Stream`` or
         ``Writer`` is deleted;
      5. any remaining run of whitespace / NBSP becomes a single space.

    The line filters are plain substring matches and need a trailing newline,
    so a matching last line without one is kept.
    """
    passes = [
        (' +', " "),
        (r'\n\s*\n', "\n"),
        ('.*?print.*\r?\n', ""),
        ('.*?cout.*\r?\n', ""),
        ('.*?cin.*\r?\n', ""),
        ('.*?scanf.*\r?\n', ""),
        ('\t', ' '),
        ('.*?Reader.*\r?\n', ""),
        ('.*?Stream.*\r?\n', ""),
        ('.*?Writer.*\r?\n', ""),
        (r'[\xa0\s]+', ' '),
    ]
    cleaned = text
    for regex, replacement in passes:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned
# collapse newlines and runs of blanks into single spaces
def remove_blanks(text):
    """Return *text* with each newline turned into a space and runs of spaces squeezed to one.

    Only the space character is squeezed; tabs and other whitespace are left
    as they are.
    """
    without_newlines = re.compile('\\n').sub(" ", text)
    return re.compile(' +').sub(' ', without_newlines)
# make code anonymous: every number is replaced by NUMBER,
# every string by STRING, every short identifier by VAR, etc.
def code_anonymous(code, keyword_file='JoernAnalyzeStackOverflowCode/code_info/c_keyWord'):
    """Return an anonymised copy of *code*.

    Double-quoted strings become ``STRING``; the result is then split with
    ``_WORD_SPLIT`` and every non-empty token is mapped as follows:

      * a single space or a keyword from *keyword_file* is kept as is;
      * an all-digit token becomes ``NUMBER``;
      * any other token of three or more characters is kept;
      * any other shorter token becomes ``' VAR '``.

    NOTE(review): the delimiters captured by ``_WORD_SPLIT`` are one character
    long, so unless they are a space or listed in the keyword file they also
    fall into the ``' VAR '`` branch -- confirm this is intended.

    Args:
        code: source text to anonymise.
        keyword_file: path of a UTF-8 file with one keyword per line.
            Defaults to the path that used to be hard-coded here.

    Returns:
        The anonymised code as a single string.
    """
    # ``with`` guarantees the keyword file is closed; it used to be leaked.
    with codecs.open(keyword_file, 'r', 'utf8') as f:
        # Keep keywords as decoded text so they compare equal to the tokens
        # on both Python 2 and Python 3.
        keyword = set(line.replace('\n', '') for line in f.readlines())

    # replace string literals
    code = re.sub("\"(.*?)\"", "STRING", code)

    pieces = []
    for token in _WORD_SPLIT.split(code):
        if token == '':
            continue
        if token == ' ' or token in keyword:
            pieces.append(token)
        elif token.isdigit():
            pieces.append('NUMBER')
        elif len(token) >= 3:
            # long variable / function names are kept
            pieces.append(token)
        else:
            # short variable / function names are replaced
            pieces.append(' VAR ')
    return ''.join(pieces)
# make sure every split char is surrounded by blanks
def get_normalize_code(code, max_lenghth):
    """Return *code* with SPLIT_CHARS delimiters padded by spaces, cut to a token budget.

    *code* is split with ``_WORD_SPLIT``; empty tokens are skipped, tokens
    found in ``SPLIT_CHARS`` are wrapped in spaces and every other token is
    appended unchanged. Processing stops once *max_lenghth* non-empty tokens
    have been consumed, and whitespace runs in the result are finally
    collapsed to single spaces.

    NOTE(review): the counter is assumed to advance for every non-empty
    token, delimiters and whitespace included -- confirm against the
    originally intended indentation.

    Args:
        code: source text to normalise.
        max_lenghth: number of non-empty tokens to keep. The limit is an
            equality check made after each token, so a value of 0 or below
            never triggers and the whole input is processed.

    Returns:
        The normalised, whitespace-collapsed string.
    """
    # Build the lookup set straight from SPLIT_CHARS so the membership test
    # below always has a real set to work on.
    split_set = set(SPLIT_CHARS)
    pieces = []
    count_length = 0
    for c in _WORD_SPLIT.split(code):
        if c == '':
            continue
        pieces.append(' ' + c + ' ' if c in split_set else c)
        count_length += 1
        if count_length == max_lenghth:
            break
    return " ".join(''.join(pieces).split())
# order AST types
def AST_type_clean(line_dict, need_repeated):
    """Serialise *line_dict* into one string of sorted type names per entry.

    Entries are visited in ascending key order. Each value is converted with
    ``str()``, split on single spaces and sorted; the sorted names are joined
    with spaces and the entries are joined with commas.

    Args:
        line_dict: mapping whose keys are sortable and whose values hold
            space-separated names.
        need_repeated: when true, repeated names within one value are kept;
            otherwise duplicates are removed before sorting.

    Returns:
        The comma-separated string, or ``''`` for an empty mapping.
    """
    line_code = []
    # Iterating the sorted keys replaces dict.iteritems(), which does not
    # exist on Python 3, and gives the same ordering.
    for key in sorted(line_dict):
        names = str(line_dict[key]).split(' ')
        if not need_repeated:
            names = set(names)
        line_code.append(' '.join(sorted(names)))
    return ','.join(line_code)