Читайте также: |
|
from __future__ import division
import sys
import csv as csv
import numpy as np
import pandas as pd
from pandas import DataFrame
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import cross_validation, svm, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
import random
import warnings
from datetime import datetime
from sklearn.grid_search import GridSearchCV
# Silence warnings process-wide: no category, module or message filter is
# given, so the "ignore" action applies to every warning raised from here on.
warnings.filterwarnings("ignore")
# Estimators selectable by name on the command line.  workMode/testMode look
# the name up here and wrap the estimator in a OneVsRestClassifier.
selClassifiers = {
    'linear': LinearSVC(),
    'linearWithSGD': SGDClassifier(),
}
# The kernel SVMs differ only in the kernel name, which doubles as the key.
selClassifiers.update(
    (kernel, SVC(kernel=kernel, probability=True))
    for kernel in ('rbf', 'poly', 'sigmoid')
)
selClassifiers['bayes'] = MultinomialNB()
# Human-readable description of each classifier name; testMode prints the
# description of the classifier it is about to evaluate.
classifierDescriptions = {
    'linearWithSGD': 'linear SVM with SGD training',
    'linear': 'linear SVM without SGD training',
    'rbf': 'SVM with RBF kernel',
    'poly': 'SVM with polynomial kernel',
    'sigmoid': 'SVM with sigmoid kernel',
    'bayes': 'Naive Bayes classifier',
}
def replacer(text):
    """Return ``str(text)`` with string-quoting noise removed.

    The value is converted with ``str()``; Python 2 style ``u'`` string
    prefixes and then every remaining apostrophe are stripped, so a label
    tuple such as ``('a', 'b')`` is rendered as ``(a, b)``.

    Only a ``u`` that is not preceded by a word character is treated as a
    string prefix.  Blindly deleting every ``u'`` pair also removed the last
    letter of values that end in "u" (``'menu'`` was rendered as ``men``).
    """
    # Local import: ``re`` is not part of the module's import block.
    import re

    without_prefix = re.sub(r"(?<!\w)u'", "'", str(text))
    return without_prefix.replace("'", '')
def workMode(fileIn, toPredict, fileOut, classif):
    """Train on ``fileIn`` and write predictions for ``toPredict`` to ``fileOut``.

    Both inputs are read as CSV files with a header row.  For every row the
    values of all columns whose name starts with ``Change`` are joined with
    commas into one text sample; the values of all columns whose name starts
    with ``Corax`` (training file only) form that row's label set.  The
    samples go through CountVectorizer -> TfidfTransformer -> a
    OneVsRestClassifier wrapping ``selClassifiers[classif]``.

    The output CSV has a ``Change`` column (the joined sample text) and a
    ``Prediction`` column (the predicted labels rendered by ``replacer``).

    Raises:
        KeyError: if ``classif`` is not a key of ``selClassifiers``.
    """
    # Fail before any file is read or model is trained, and say what is valid.
    if classif not in selClassifiers:
        raise KeyError('Unknown classifier %r; available: %s'
                       % (classif, ', '.join(sorted(selClassifiers))))

    work = pd.read_csv(fileIn, header=0, encoding='utf-8-sig')
    work_test = pd.read_csv(toPredict, header=0, encoding='utf-8-sig')

    def joined_changes(frame):
        # str.join only accepts strings, so a non-string cell raises TypeError.
        # NOTE(review): presumably 'Change' cells are never empty - confirm.
        change_cols = [c for c in frame.columns if c.startswith('Change')]
        return np.array([','.join(row.tolist())
                         for row in frame[change_cols].values])

    X_train = joined_changes(work)
    X_test = joined_changes(work_test)

    label_cols = [c for c in work.columns if c.startswith('Corax')]
    y_train = [list(row) for row in work[label_cols].values]

    lb = preprocessing.MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)

    print("Getting results of classifier")
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(selClassifiers[classif])),
    ])
    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)
    all_labels = lb.inverse_transform(predicted)

    # DataFrame.from_items no longer exists in current pandas; the plain
    # constructor with an explicit column order works on old and new versions.
    df = DataFrame(
        {
            'Change': X_test,
            'Prediction': [replacer(labels) for labels in all_labels],
        },
        columns=['Change', 'Prediction'],
    )
    df.to_csv(fileOut)
def testMode(fileIn, fileOut, classif):
    """Evaluate a classifier on a random 90/10 split of ``fileIn``.

    ``fileIn`` is read as a CSV file with a header row.  A random sample of
    ``int(len(df) * 0.9)`` rows is used for training and the remaining rows
    for testing.  For every row the values of all columns whose name starts
    with ``Change`` are joined with commas into one text sample; the values
    of all columns whose name starts with ``Corax`` form its label set.  The
    samples go through CountVectorizer -> TfidfTransformer -> a
    OneVsRestClassifier wrapping ``selClassifiers[classif]``.

    Prints how many predicted labels occur in the true label set of their row
    and that count as a percentage of all predicted labels, then writes a CSV
    with ``Test``, ``RealAnswer`` and ``Prediction`` columns to ``fileOut``.

    Raises:
        KeyError: if ``classif`` is not a key of ``selClassifiers`` and
            ``classifierDescriptions``.
    """
    # Fail before any file is read or model is trained, and say what is valid.
    if classif not in selClassifiers or classif not in classifierDescriptions:
        raise KeyError('Unknown classifier %r; available: %s'
                       % (classif, ', '.join(sorted(selClassifiers))))

    df = pd.read_csv(fileIn, header=0, encoding='utf-8-sig')
    rows = random.sample(list(df.index), int(len(df) * 0.9))
    # .loc replaces the .ix indexer, which no longer exists in current pandas;
    # ``rows`` holds index labels, so the selection is label-based.
    work = df.loc[rows]
    work_test = df.drop(rows)

    def joined_changes(frame):
        # str.join only accepts strings, so a non-string cell raises TypeError.
        # NOTE(review): presumably 'Change' cells are never empty - confirm.
        change_cols = [c for c in frame.columns if c.startswith('Change')]
        return np.array([','.join(row.tolist())
                         for row in frame[change_cols].values])

    def label_lists(frame):
        label_cols = [c for c in frame.columns if c.startswith('Corax')]
        return [list(row) for row in frame[label_cols].values]

    X_train = joined_changes(work)
    y_train = label_lists(work)
    X_test = joined_changes(work_test)
    y_test = label_lists(work_test)

    lb = preprocessing.MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)

    print("Getting results of %s" % classifierDescriptions[classif])
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(selClassifiers[classif])),
    ])
    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)
    all_labels = lb.inverse_transform(predicted)

    CorPred = 0
    Total = 0
    for item, labels in zip(y_test, all_labels):
        CorPred += sum(1 for res in labels if res in item)
        Total += len(labels)
    print('Predicted correctly %s labels out of %s labels' % (CorPred, Total))
    if Total:
        print('Precision is %.2f %%' % (100 * float(CorPred) / float(Total)))
    else:
        # Nothing was predicted: the ratio is undefined, and dividing would
        # abort the run before the results file is written.
        print('Precision is undefined: no labels were predicted')

    # DataFrame.from_items no longer exists in current pandas; the plain
    # constructor with an explicit column order works on old and new versions.
    results = DataFrame(
        {
            'Test': X_test,
            'RealAnswer': [replacer(answer) for answer in y_test],
            'Prediction': [replacer(labels) for labels in all_labels],
        },
        columns=['Test', 'RealAnswer', 'Prediction'],
    )
    results.to_csv(fileOut)
def main():
    """Command-line entry point: dispatch to ``testMode`` or ``workMode``.

    Expected arguments (taken from ``sys.argv``)::

        test <fileIn> <fileOut> <classifier>
        work <fileIn> <toPredict> <fileOut> <classifier>

    Arguments beyond the ones listed are ignored.  If the mode or any of its
    arguments is missing, a usage message is printed instead of failing with
    an IndexError.  Start time, end time and elapsed seconds are always
    printed.
    """
    usage = (
        'Usage:\n'
        '  test <fileIn> <fileOut> <classifier>\n'
        '  work <fileIn> <toPredict> <fileOut> <classifier>'
    )

    start = datetime.now()
    print("Program started at %s" % start)

    args = sys.argv[1:]
    mode = args[0] if args else None
    if mode is None:
        print(usage)
    elif mode == 'test':
        if len(args) >= 4:
            testMode(args[1], args[2], args[3])
        else:
            print(usage)
    elif mode == 'work':
        if len(args) >= 5:
            workMode(args[1], args[2], args[3], args[4])
        else:
            print(usage)
    else:
        print('Unknown mode, only test or work modes are available')

    end = datetime.now()
    print("Program finished at %s" % end)
    print("It took %s seconds for program to complete" % (end - start).total_seconds())


if __name__ == '__main__':
    main()
Дата добавления: 2015-11-16; просмотров: 45 | Нарушение авторских прав
<== предыдущая страница | | | следующая страница ==> |
Приложение 2. Файл конфигурации системы предобработки информации | | | TRANSLATION STUDIES |