-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathutils.py
106 lines (96 loc) · 3.47 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import re
from io import StringIO
import tokenize
# Matches, in order of alternation: `//` line comments, `/* ... */` block
# comments, single-quoted literals and double-quoted literals. The string
# alternatives exist only so that comment markers inside a literal are consumed
# as part of the literal and left untouched.
_C_STYLE_COMMENT_OR_STRING = re.compile(
    r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
    re.DOTALL | re.MULTILINE
)


def _drop_blank_lines(text):
    """Return *text* with empty and whitespace-only lines removed."""
    return '\n'.join(line for line in text.split('\n') if line.strip() != "")


def _strip_python_comments_and_docstrings(source):
    """Re-emit Python *source* token by token, omitting comments and docstrings.

    Horizontal spacing is rebuilt from token column offsets, so the gap that
    preceded a dropped comment is still emitted as spaces.
    """
    pieces = []
    # Starting in the INDENT state makes a string that is the very first token
    # (a module docstring) count as a docstring.
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(StringIO(source).readline):
        token_type, token_string = tok[0], tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        # A token on a new line measures its gap from column 0.
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            pieces.append(" " * (start_col - last_col))
        if token_type == tokenize.COMMENT:
            # Comments are dropped.
            pass
        elif token_type == tokenize.STRING:
            # A string is kept only when it is part of an expression: it must
            # not directly follow an INDENT or NEWLINE token and must not
            # start in column 0. Everything else is treated as a docstring.
            if (prev_toktype != tokenize.INDENT
                    and prev_toktype != tokenize.NEWLINE
                    and start_col > 0):
                pieces.append(token_string)
        else:
            pieces.append(token_string)
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    return "".join(pieces)


def _strip_c_style_comments(source):
    """Replace `//` and `/* */` comments in *source* with a single space."""
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " "  # note: a space and not an empty string
        return s

    return _C_STYLE_COMMENT_OR_STRING.sub(replacer, source)


def remove_comments_and_docstrings(source, lang):
    """Return *source* without comments (and, for Python, docstrings).

    Args:
        source: The program text to clean.
        lang: Language name. ``'python'`` is processed with the ``tokenize``
            module, ``'ruby'`` is returned unchanged, and any other value is
            treated as a language with C-style ``//`` and ``/* */`` comments.

    Returns:
        The cleaned text with blank and whitespace-only lines removed
        (except for ``'ruby'``, where *source* is returned as is).

    Raises:
        Whatever ``tokenize.generate_tokens`` raises when *source* is not
        tokenizable Python (only for ``lang == 'python'``).
    """
    if lang in ['python']:
        return _drop_blank_lines(_strip_python_comments_and_docstrings(source))
    if lang in ['ruby']:
        return source
    return _drop_blank_lines(_strip_c_style_comments(source))
def tree_to_token_index(root_node):
    """Collect the source spans of all token nodes under *root_node*.

    A node counts as a token when it has no children or its type is
    ``'string'``, provided its type is not ``'comment'``. Token nodes are not
    descended into; every other node is expanded into its children.

    The traversal uses an explicit stack instead of recursion so that deeply
    nested trees do not hit the interpreter's recursion limit.

    Args:
        root_node: A tree node exposing ``children``, ``type``,
            ``start_point`` and ``end_point``.

    Returns:
        A list of ``(start_point, end_point)`` tuples in left-to-right
        (pre-order) order.
    """
    token_spans = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        is_token = (
            (len(node.children) == 0 or node.type == 'string')
            and node.type != 'comment'
        )
        if is_token:
            token_spans.append((node.start_point, node.end_point))
        else:
            # Push children reversed so they are popped left to right.
            pending.extend(reversed(node.children))
    return token_spans
def tree_to_variable_index(root_node, index_to_code):
    """Collect the spans of token nodes whose type differs from their text.

    Token nodes are selected as in ``tree_to_token_index``: no children or
    type ``'string'``, and type not ``'comment'``. For each one, its
    ``(start_point, end_point)`` span is looked up in *index_to_code* and the
    span is kept only when the node's type is not equal to the stored code
    text.

    The traversal uses an explicit stack instead of recursion so that deeply
    nested trees do not hit the interpreter's recursion limit.

    Args:
        root_node: A tree node exposing ``children``, ``type``,
            ``start_point`` and ``end_point``.
        index_to_code: Mapping from ``(start_point, end_point)`` to a
            two-item value whose second item is the token's code text.

    Returns:
        A list of ``(start_point, end_point)`` tuples in left-to-right
        (pre-order) order.

    Raises:
        KeyError: If a token node's span is missing from *index_to_code*.
    """
    variable_spans = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        is_token = (
            (len(node.children) == 0 or node.type == 'string')
            and node.type != 'comment'
        )
        if not is_token:
            # Push children reversed so they are popped left to right.
            pending.extend(reversed(node.children))
            continue
        span = (node.start_point, node.end_point)
        _, code = index_to_code[span]
        if node.type != code:
            variable_spans.append(span)
    return variable_spans
def index_to_code_token(index, code):
    """Extract the text covered by a ``(start_point, end_point)`` span.

    Args:
        index: A pair ``(start_point, end_point)`` where each point is a
            ``(row, column)`` pair.
        code: The source as a sequence of lines, indexed by row.

    Returns:
        The text from the start point up to (not including) the end column.
        For a span crossing several rows, the pieces are concatenated with no
        separator added: the tail of the first row, every row in between in
        full, and the head of the last row.
    """
    start_point = index[0]
    end_point = index[1]
    start_row, start_col = start_point[0], start_point[1]
    end_row, end_col = end_point[0], end_point[1]
    if start_row == end_row:
        return code[start_row][start_col:end_col]
    pieces = [code[start_row][start_col:]]
    pieces.extend(code[row] for row in range(start_row + 1, end_row))
    pieces.append(code[end_row][:end_col])
    return "".join(pieces)