-
Feature: opcode, API
-
Detection Model: DNN (PyTorch), LightGBM, Random Forest
-
Accuracy: 90~93%
- 하지만 예선에서 검증할 땐 81%... overfitting의 위험성을 알게 되었다.
- Feature의 중요성
- Jaeyung's Tistory
# Build, for every sample, a map of "does this file import API X?" features.
#
# Reads (defined elsewhere in the file): filelist, data_dir, api_list.
# Writes: get_api[file_name] -> defaultdict where
#   1  = the API is in api_list and is imported by the file,
#   0  = default for any API not seen in the file's import table,
#  -1  = default for every API when the file could not be parsed or its
#        import table could not be read.
# NOTE(review): this loop iterates `filelist` while the opcode loop further
# down iterates `file_list` -- confirm both names are defined as intended.
start = time.time()
for index, i in enumerate(filelist):
    print(index, i)
    api_dict = defaultdict(lambda: 0)
    pe = None
    try:
        pe = pefile.PE(data_dir + "/" + i)
        pe.parse_data_directories()
        # Record every API this file imports. If the PE object has no
        # DIRECTORY_ENTRY_IMPORT attribute, the AttributeError is handled
        # below and the whole sample is marked with -1.
        for entry in pe.DIRECTORY_ENTRY_IMPORT:
            for imp in entry.imports:
                try:
                    api_name = imp.name.decode("utf-8")
                except (AttributeError, UnicodeDecodeError):
                    # No usable name (e.g. imp.name is None) or the name is
                    # not valid UTF-8: skip this import, keep the others.
                    continue
                if api_name in api_list:
                    api_dict[api_name] = 1
    except Exception:
        # Best-effort: samples may be malformed, so any parsing failure
        # marks the sample instead of aborting the run. Unlike a bare
        # `except:`, Ctrl-C / SystemExit still propagate.
        api_dict = defaultdict(lambda: -1)
    finally:
        # Release the file mapping held by pefile so handles do not pile up
        # over a large sample set.
        if pe is not None:
            pe.close()
    get_api[i] = api_dict
end = time.time()
import shlex  # local import: only needed to quote the objdump target path

# Disassemble every sample with objdump and count the two-character tokens
# of the instruction-byte column that appear in opcodes_list.
#
# Reads (defined elsewhere in the file): file_list, data_dir, opcodes_list.
# Writes:
#   all_data[file_name] -> {token: occurrences in that file}
#   idf_data[token]     -> incremented once per file containing the token
#                          (document frequency, used later for IDF).
start = time.time()
for count, i in enumerate(file_list):
    if count % 100 == 0:
        print(count)
    # Quote the path so file names containing spaces or shell metacharacters
    # cannot break (or inject into) the shell pipeline below.
    target = shlex.quote(data_dir + "/" + i)
    command = (
        "objdump -d " + target
        + "|grep '[0-9a-f]:'|grep -v 'file'|cut -f2 -d:|cut -f1-6 -d' '|tr -s ' '|tr '\t' ' '|sed 's/ $//g'|paste -d '' -s |sed 's/^/\"/'|sed 's/$/\"/g'"
    )
    return_v = subprocess.check_output(command, shell=True).decode("utf-8")
    result = return_v.split()
    # Count tokens once, keeping only two-character tokens that are in
    # opcodes_list (first-seen order is preserved).
    opcodes = {
        token: freq
        for token, freq in Counter(result).items()
        if len(token) == 2 and token in opcodes_list
    }
    # For IDF: each kept token counts once per file, regardless of frequency.
    for v in opcodes:
        idf_data[v] += 1
    all_data[i] = opcodes
end = time.time()