-
Notifications
You must be signed in to change notification settings - Fork 46
/
csv_to_vw.py
55 lines (45 loc) · 2.06 KB
/
csv_to_vw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: UTF-8 -*-
########################################################
# __Author__: Triskelion <info@mlwave.com> #
# Kaggle competition "Display Advertising Challenge": #
# http://www.kaggle.com/c/criteo-display-ad-challenge/ #
# Credit: Zygmunt Zając <zygmunt@fastml.com> #
########################################################
from datetime import datetime
from csv import DictReader
def csv_to_vw(loc_csv, loc_output, train=True):
    """
    Munge a CSV file (loc_csv) into a Vowpal Wabbit file (loc_output).

    Every CSV row becomes one output line of the form

        <label> '<Id> |i <name>:<value> ... |c <value> ...

    Columns whose name contains "I" go into the ``i`` namespace as
    ``name:value`` pairs; columns whose name contains "C" go into the ``c``
    namespace as bare values. The "Label" and "Id" columns are never
    emitted as features, and empty or missing values are skipped.

    Args:
        loc_csv: Path of the CSV file to read. It must have a header row
            with an "Id" column, plus a "Label" column when train is True.
        loc_output: Path of the VW file to write (overwritten if present).
        train: True for a training set: the label is 1 when the "Label"
            column equals "1", otherwise -1. Set to False when munging a
            test set: every line then gets the placeholder label 1.

    Raises:
        KeyError: If a required "Id" / "Label" column is absent.

    Both files are handled as UTF-8 text and lines always end with "\\n".

    TODO: Too slow for a daily cron job. Try to optimize (Pandas or Go).
    """
    start = datetime.now()
    print("\nTurning %s into %s. Is_train_set? %s" % (loc_csv, loc_output, train))

    non_feature_columns = ("Label", "Id")
    # Pre-bind the row index so the final report works for a CSV without rows.
    e = 0

    # newline="" lets the csv module handle line endings itself; the output is
    # opened in text mode (str cannot be written to a "wb" handle on Python 3)
    # with newline="\n" so no platform-specific translation takes place.
    with open(loc_csv, newline="", encoding="utf-8") as infile:
        with open(loc_output, "w", newline="\n", encoding="utf-8") as outfile:
            for e, row in enumerate(DictReader(infile)):
                # Creating the features
                numerical_features = []
                categorical_features = []
                for k, v in row.items():
                    # Skip bookkeeping columns and empty values; DictReader
                    # yields None for fields missing from a short row.
                    if k in non_feature_columns or not v:
                        continue
                    if "I" in k:  # numerical feature, example: I5
                        numerical_features.append(" %s:%s" % (k, v))
                    if "C" in k:  # categorical feature, example: C2
                        categorical_features.append(" %s" % v)

                # Creating the labels
                if train:  # we care about labels
                    # negative examples are labelled -1
                    label = 1 if row["Label"] == "1" else -1
                else:  # we don't care about labels: constant placeholder
                    label = 1

                outfile.write("%s '%s |i%s |c%s\n" % (
                    label,
                    row["Id"],
                    "".join(numerical_features),
                    "".join(categorical_features),
                ))

                # Reporting progress
                if e % 1000000 == 0:
                    print("%s\t%s" % (e, str(datetime.now() - start)))

    print("\n %s Task execution time:\n\t%s" % (e, str(datetime.now() - start)))
#csv_to_vw("d:\\Downloads\\train\\train.csv", "c:\\click.train.vw",train=True)
#csv_to_vw("d:\\Downloads\\test\\test.csv", "d:\\click.test.vw",train=False)