# coding=utf-8
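"""Compute TF-IDF features for a one-document-per-line corpus, cluster them with KMeans and plot a 2-D PCA projection."""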
import time
import re
import os
import sys
import codecs
import shutil
import numpy as np
import matplotlib
import scipy
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
if __name__ == "__main__":
    # Pipeline run by this script:
    #   1. read one document per line from the corpus file,
    #   2. build the TF-IDF matrix and dump it to BHTfidf_Result.txt,
    #   3. cluster the documents with KMeans (4 clusters),
    #   4. project the TF-IDF matrix to 2-D with PCA and scatter-plot it.

    #########################################################################
    # Step 1: TF-IDF
    #
    # Each stripped line of the input file is treated as one document.
    # The context manager guarantees the handle is closed even on error.
    with open('01_All_BHSpider_Content_Result.txt', 'r') as source:
        corpus = [line.strip() for line in source]

    # Reference: http://blog.csdn.net/abcjennifer/article/details/23615947
    # (HashingVectorizer(n_features=4000) was an alternative tried here.)

    # CountVectorizer turns the corpus into a document/term count matrix;
    # TfidfTransformer re-weights those counts into tf-idf values.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # Vocabulary terms, in column order. get_feature_names() was removed in
    # scikit-learn 1.2, so prefer its replacement when it is available and
    # fall back to the old name on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out()
    else:
        word = vectorizer.get_feature_names()

    # Dense matrix: weight[i][j] is the tf-idf weight of term j in document i.
    weight = tfidf.toarray()

    print('Features length: ' + str(len(word)))

    # Dump the vocabulary on the first record, then one record of weights per
    # document. Every item is followed by a single space and every record by
    # a blank line ('\r\n\r\n'), exactly as before.
    resName = "BHTfidf_Result.txt"
    with codecs.open(resName, 'w', 'utf-8') as result:
        result.write(''.join(term + ' ' for term in word))
        result.write('\r\n\r\n')
        for row in weight:
            result.write(''.join(str(value) + ' ' for value in row))
            result.write('\r\n\r\n')

    ########################################################################
    # Step 2: KMeans clustering
    print('Start Kmeans:')
    from sklearn.cluster import KMeans
    clf = KMeans(n_clusters=4)
    s = clf.fit(weight)
    print(s)

    # Alternative previously tried here: sklearn.cluster.MiniBatchKMeans with
    # n_clusters=20, fitted the same way (clf.fit(weight)).

    # Cluster centres, one row per cluster.
    print(clf.cluster_centers_)

    # Cluster id assigned to every document.
    # NOTE(review): the original comment suggests 1000 documents in 4 groups;
    # confirm against the actual corpus file.
    print(clf.labels_)
    label = list(clf.labels_)  # plain-list copy; not used again in this script
    for position, cluster_id in enumerate(label, 1):
        # 1-based document number followed by its cluster id.
        print('%s %s' % (position, cluster_id))

    # Inertia (sum of squared distances of samples to their closest centre);
    # useful for judging whether the chosen number of clusters is suitable.
    print(clf.inertia_)

    ########################################################################
    # Step 3: reduce to two dimensions and plot
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    newData = pca.fit_transform(weight)
    print(newData)

    # Rows are coloured by their position in the corpus, not by the KMeans
    # label. NOTE(review): presumably the input file is ordered by source
    # category (400 / 200 / 200 / 200 documents) - confirm with the crawler.
    # Slices are used instead of per-index access so that a corpus with fewer
    # than 1000 documents plots what exists instead of raising IndexError.
    row_groups = [
        (0, 400, 'or'),
        (400, 600, 'og'),
        (600, 800, 'ob'),
        (800, 1000, 'ok'),
    ]
    for start, stop, style in row_groups:
        plt.plot(newData[start:stop, 0], newData[start:stop, 1], style)
    plt.show()