|
"""Train a Doc2Vec model on the sentences in ``../book_data`` and plot it.

Every entry in the data directory is read as UTF-8 text and split into
sentences on the Chinese full stop "。". Each sentence is tokenised with jieba
and becomes one ``TaggedDocument``. After training, the model is saved, its
word vectors (``model.wv``) are projected to two dimensions with PCA, and an
annotated scatter plot is written to disk and then shown.
"""
import os

import jieba
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from matplotlib import pyplot, pyplot as plt
from pylab import mpl
from sklearn.decomposition import PCA

# Directory whose entries are the raw text files used for training.
BOOK_DATA_DIR = "../book_data"

# Set the default sans-serif font so that plots can render Chinese text.
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# Keep the minus sign '-' from being drawn as a box in saved figures.
mpl.rcParams['axes.unicode_minus'] = False


def _read_sentences(path):
    """Return the UTF-8 text of *path* split on the Chinese full stop "。".

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read, instead of being left for the garbage collector.
    """
    with open(path, encoding="utf-8") as handle:
        return handle.read().split("。")


# Build the training corpus: one TaggedDocument per sentence.
# NOTE(review): the original indentation was lost, so the nesting of
# ``doc_index += 1`` is an assumption. It is placed inside the sentence loop,
# giving every sentence its own tag -- confirm that per-file tags were not
# intended.
documents = []
doc_index = 0
book_data = os.listdir(BOOK_DATA_DIR)
for book_data_one in book_data:
    for line in _read_sentences(os.path.join(BOOK_DATA_DIR, book_data_one)):
        tokens = jieba.lcut(line, HMM=False)
        documents.append(TaggedDocument(tokens, [doc_index]))
        doc_index += 1

model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Vocabulary in the model's own index order; reused below so that row i of the
# PCA projection lines up with words[i].
words = list(model.wv.index_to_key)
print(words)
print(len(words))

# NOTE(review): Doc2Vec.save() writes gensim's own model format, not a NumPy
# array, despite the ".npy" suffix -- presumably reload with Doc2Vec.load().
model.save("book_data_doc2vec.npy")

# Fit a 2-D PCA model to the word vectors.
X = [model.wv.get_vector(word) for word in words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# Scatter plot of the projection, with every point labelled by its word.
plt.figure(figsize=(45, 45))
plt.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.savefig('./book_data-doc2vec.30.jpg', dpi=200, bbox_inches='tight')

plt.show()
|