StackOverflow-Joern/code_clean_utils.py at master · yang1young/StackOverflow-Joern · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
import re
import codecs
import mysql.connector

# Single characters that get_normalize_code looks up (via a set) to decide
# whether a token should be padded with blanks on both sides.
SPLIT_CHARS = [
    ',', '+', '&', '!', '%', '?', '_', '|', ':', '-',
    '=', '\\', '~', '*', '^', '<', '>', '[', ']', '$',
    '{', '}', ';', '.', '`', '@', '(', ')',
]
# Tokenizer: splits on any single delimiter or whitespace character.  The
# capture group makes re.split() keep the delimiters in its result.  This is a
# raw *text* pattern: the previous bytes literal could not be applied to str
# input on Python 3, while on Python 2 both spellings are the same str value.
_WORD_SPLIT = re.compile(r"([,+\-&!%'_?|=\s/\*^<>$@\[\](){}#;])")

# all kind of split char
def get_split_set():
    """Return the characters listed in SPLIT_CHARS as a set.

    Returns:
        set: one entry per distinct character in SPLIT_CHARS, suitable for
        constant-time membership tests.
    """
    # The set has to be returned explicitly; without the return statement
    # every caller would receive None.
    return set(SPLIT_CHARS)

# remove all c/c++ comments from code
def remove_cpp_comment(code):
    """Strip C/C++ comments from *code*, leaving quoted literals untouched.

    Newlines that were inside a removed block comment are kept, so the
    remaining code stays on the same line numbers.

    Args:
        code: source text to clean.

    Returns:
        The text with ``//`` and ``/* ... */`` comments removed.
    """
    # Alternatives, in order: line comment, block comment, single-quoted
    # literal, double-quoted literal.  The literals are matched only so that
    # comment markers appearing inside them are skipped over.
    comment_or_literal = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def substitute(found):
        token = found.group(0)
        if not token.startswith('/'):
            # Quoted literal: hand it back unchanged.
            return token
        # Comment: keep only its newline characters.
        return "\n" * token.count('\n')

    return comment_or_literal.sub(substitute, code)


#remove non ASCII chars
def replace_trash(unicode_string):
    """Return *unicode_string* with every non-ASCII character replaced by a space.

    The earlier loop rebound the whole string to "" on the first non-ASCII
    character (and then kept indexing the now-empty string inside a bare
    ``except``), so any input containing one such character came back empty.
    Each offending character is now replaced individually.

    Args:
        unicode_string: text to sanitise.

    Returns:
        A string of the same length in which characters with a code point of
        128 or above are replaced by a single space.
    """
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in unicode_string)


#remove non ascii code from text
def remove_non_ascii(text):
    """Return *text* with each character of code point >= 128 turned into a space."""
    cleaned = []
    for ch in text:
        if ord(ch) >= 128:
            cleaned.append(' ')
        else:
            cleaned.append(ch)
    return ''.join(cleaned)

#line count
def line_count(file):
    """Count the newline characters in the file at path *file*.

    The file is read in binary mode in 8 MiB chunks, so it is never loaded
    into memory as a whole.

    Args:
        file: path of the file to scan.

    Returns:
        int: number of ``\\n`` bytes found in the file.
    """
    count = 0
    # "with" closes the handle even if a read fails part-way through.
    with open(file, 'rb') as the_file:
        while True:
            chunk = the_file.read(8192 * 1024)
            if not chunk:
                break
            # The chunk is bytes, so the needle must be bytes as well.
            count += chunk.count(b'\n')
    return count


# remove standard IO and File operation line
def code_clean(text):
    """Drop console/file I/O lines from *text* and flatten its whitespace.

    Any newline-terminated line containing ``print``, ``cout``, ``cin``,
    ``scanf``, ``Reader``, ``Stream`` or ``Writer`` is removed.  Afterwards
    every run of whitespace (including newlines and ``\\xa0``) is collapsed
    to one space, so the result is a single line.

    Args:
        text: source code to clean.

    Returns:
        The cleaned, single-line text.
    """
    def drop_lines_with(word):
        # Matches a whole line containing *word*, including its line ending.
        return ('.*?' + word + '.*\r?\n', "")

    # Substitutions are applied strictly in this order.
    steps = (
        (' +', " "),            # squeeze runs of blanks
        (r'\n\s*\n', "\n"),     # squeeze blank lines
        drop_lines_with('print'),
        drop_lines_with('cout'),
        drop_lines_with('cin'),
        drop_lines_with('scanf'),
        ('\t', ' '),            # tabs become blanks
        drop_lines_with('Reader'),
        drop_lines_with('Stream'),
        drop_lines_with('Writer'),
    )

    cleaned = text
    for regex, replacement in steps:
        cleaned = re.sub(regex, replacement, cleaned)

    return re.sub(r'[\xa0\s]+', ' ', cleaned)


# replace newlines with blanks and collapse multiple blanks into a single one
def remove_blanks(text):
    """Turn every newline in *text* into a blank, then squeeze runs of blanks to one."""
    without_newlines = re.sub('\\n', " ", text)
    return re.sub(' +', ' ', without_newlines)


# make code anonymous, such as all number replaced by NUMBER
# all string replaced by STRING, all variable changed to VAR,etc
def code_anonymous(code, keyword_path='JoernAnalyzeStackOverflowCode/code_info/c_keyWord'):
    """Anonymise *code* by replacing literals and short identifiers with placeholders.

    Double-quoted strings become ``STRING``, all-digit tokens become
    ``NUMBER``, and any other token shorter than three characters that is not
    a listed keyword becomes `` VAR ``.  Single blanks, keywords and tokens of
    three or more characters are kept as they are.

    Args:
        code: source text to anonymise.
        keyword_path: UTF-8 text file with one keyword per line.  Defaults to
            the path this module has always used, so existing callers are
            unaffected.

    Returns:
        The anonymised code as a single string.
    """
    # "with" guarantees the keyword file is closed.  Only the line terminator
    # is stripped so keywords compare equal to the text tokens produced below.
    with codecs.open(keyword_path, 'r', 'utf8') as f:
        keyword = set(line.rstrip('\r\n') for line in f)

    # Replace string literals (quotes included) before tokenizing.
    code = re.sub(r'"(.*?)"', "STRING", code)

    pieces = []
    for token in _WORD_SPLIT.split(code):
        if token == ' ' or token in keyword:
            pieces.append(token)
        elif token == '':
            # re.split() yields empty strings between adjacent delimiters.
            continue
        elif token.isdigit():
            pieces.append('NUMBER')
        elif len(token) >= 3:
            # Longer names are kept verbatim.
            pieces.append(token)
        else:
            # Short, non-keyword tokens are replaced.
            pieces.append(' VAR ')
    return ''.join(pieces)


# make sure every split char is surrounded by blanks
def get_normalize_code(code,max_lenghth):
    """Tokenize *code* and pad every split character with blanks.

    Tokens come from ``_WORD_SPLIT``.  A token listed in ``SPLIT_CHARS`` is
    emitted with a blank on each side; any other token is emitted unchanged.
    Processing stops once ``max_lenghth`` non-empty tokens have been emitted,
    and whitespace in the result is then collapsed to single blanks.

    Args:
        code: source text to normalise.
        max_lenghth: number of non-empty tokens after which to stop.

    Returns:
        The normalised text with single-blank separation.
    """
    # Built straight from SPLIT_CHARS so the membership test below always has
    # a real set to work with.
    split_set = frozenset(SPLIT_CHARS)
    parts = []
    count_length = 0
    for c in _WORD_SPLIT.split(code):
        if c != '':
            parts.append(' ' + c + ' ' if c in split_set else c)
            count_length += 1
        if count_length == max_lenghth:
            break
    # split()/join collapses the padding and any original whitespace runs.
    return " ".join("".join(parts).split())


# order AST type
def AST_type_clean(line_dict, need_repeated):
    """Serialise *line_dict* into one comma-separated string ordered by key.

    Each value is converted with ``str()``, split on single blanks, sorted,
    and re-joined with blanks.  The per-key strings are then joined with
    commas in ascending key order.

    Args:
        line_dict: mapping whose values are blank-separated token strings.
        need_repeated: when true, duplicate tokens within a value are kept;
            otherwise each token appears once.

    Returns:
        str: the joined representation; an empty string for an empty mapping.
    """
    line_code = []
    # Iterating sorted keys works on Python 2 and 3 alike, unlike
    # dict.iteritems(), which exists only on Python 2.
    for key in sorted(line_dict):
        tokens = str(line_dict[key]).split(' ')
        if not need_repeated:
            tokens = set(tokens)
        line_code.append(' '.join(sorted(tokens)))

    return ','.join(line_code)