Révision | 2c007f732b7bab7aa98c765d88647b0014c2bdcf |
---|---|
Taille | 940 octets |
Date et heure | 2015-03-26 01:36:49 |
Auteur | Lorenzo Isella |
Message de Log | A simple script to convert the test and train datasets (without the target values!) to numerical matrices of term frequency–inverse document frequency (TF-IDF) features. |
#! /usr/bin/env python
import pandas as pd
import numpy as np
from sklearn import ensemble, feature_extraction, preprocessing
# Convert the feature columns of the train and test tables into TF-IDF
# matrices and dump each one as a comma-separated text file.

# Load the input tables; both files are looked up relative to the current
# working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the target column aside before it is dropped. The labels are not
# used again in this script, so they are not part of the output files.
labels = train.target.values

# Remove the non-feature columns so that only the feature columns
# (presumably raw counts -- confirm against the input files) are handed
# to the transformer.
train = train.drop(['id', 'target'], axis=1)
test = test.drop('id', axis=1)

# Fit the transformer on the training table only, then apply the same
# fitted transformer to the test table so both share one weighting.
tfidf = feature_extraction.text.TfidfTransformer()
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# Write the transformed matrices with a comma delimiter.
np.savetxt("train-tfidf.csv", train, delimiter=",")
np.savetxt("test-tfidf.csv", test, delimiter=",")

# The call form prints the same text on Python 2 and Python 3; the
# original print statement is a SyntaxError on Python 3.
print("So far so good")