|
|
@@ -0,0 +1,187 @@ |
|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"metadata": { |
|
|
|
"execution": { |
|
|
|
"iopub.execute_input": "2022-11-26T07:01:22.955342Z", |
|
|
|
"iopub.status.busy": "2022-11-26T07:01:22.955081Z", |
|
|
|
"iopub.status.idle": "2022-11-26T07:01:23.974518Z", |
|
|
|
"shell.execute_reply": "2022-11-26T07:01:23.973459Z", |
|
|
|
"shell.execute_reply.started": "2022-11-26T07:01:22.955319Z" |
|
|
|
}, |
|
|
|
"scrolled": true |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [ |
|
|
|
"from sklearn.feature_extraction.text import CountVectorizer # Bags of Words,也称为Count Vectors\n", |
|
|
|
"from sklearn.feature_extraction.text import TfidfTransformer" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"Bag of Words" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 3, |
|
|
|
"metadata": { |
|
|
|
"execution": { |
|
|
|
"iopub.execute_input": "2022-11-26T07:01:23.990837Z", |
|
|
|
"iopub.status.busy": "2022-11-26T07:01:23.990080Z", |
|
|
|
"iopub.status.idle": "2022-11-26T07:01:23.997705Z", |
|
|
|
"shell.execute_reply": "2022-11-26T07:01:23.996746Z", |
|
|
|
"shell.execute_reply.started": "2022-11-26T07:01:23.990805Z" |
|
|
|
}, |
|
|
|
"scrolled": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"[[0 1 1 1 0 0 1 0 1]\r\n", |
|
|
|
" [0 2 0 1 0 1 1 0 1]\r\n", |
|
|
|
" [1 0 0 1 1 0 1 1 1]\r\n", |
|
|
|
" [0 1 1 1 0 0 1 0 1]]\r\n", |
|
|
|
"['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\r\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# Bag-of-words demo: represent each document as a vector of raw token counts.\n", |
"# References:\n", |
"#   https://cloud.tencent.com/developer/article/1676848\n", |
"#   https://www.cnblogs.com/wuchuanying/p/6231912.html\n", |
"\n", |
"# CountVectorizer is imported once, in the notebook's first cell.\n", |
"corpus = [\n", |
"    'This is the first document.',\n", |
"    'This document is the second document.',\n", |
"    'And this is the third one.',\n", |
"    'Is this the first document?',\n", |
"]\n", |
"\n", |
"vectorizer = CountVectorizer()\n", |
"# fit_transform learns the vocabulary and returns a sparse count matrix;\n", |
"# toarray() expands it to a dense array so every entry is printed.\n", |
"bow_counts = vectorizer.fit_transform(corpus)\n", |
"print(bow_counts.toarray())\n", |
"\n", |
"# The learned terms in alphabetical order; column j of the matrix printed above\n", |
"# counts the j-th term of this list.\n", |
"print(sorted(vectorizer.vocabulary_))" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"TF-IDF" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"metadata": { |
|
|
|
"execution": { |
|
|
|
"iopub.execute_input": "2022-11-26T07:01:23.976719Z", |
|
|
|
"iopub.status.busy": "2022-11-26T07:01:23.975725Z", |
|
|
|
"iopub.status.idle": "2022-11-26T07:01:23.988971Z", |
|
|
|
"shell.execute_reply": "2022-11-26T07:01:23.988172Z", |
|
|
|
"shell.execute_reply.started": "2022-11-26T07:01:23.976686Z" |
|
|
|
}, |
|
|
|
"scrolled": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"输出x_train文本向量:\r\n", |
|
|
|
"[[0.5 0.5 0. 0.5 0. 0.\r\n", |
|
|
|
" 0. 0. 0.5 0. 0. 0.\r\n", |
|
|
|
" 0. 0. 0. 0. 0. 0. ]\r\n", |
|
|
|
" [0. 0. 0.28428538 0. 0. 0.37380112\r\n", |
|
|
|
" 0. 0. 0. 0.37380112 0.37380112 0.\r\n", |
|
|
|
" 0.37380112 0.37380112 0.37380112 0. 0.28428538 0. ]\r\n", |
|
|
|
" [0. 0. 0.19534855 0. 0.25685987 0.\r\n", |
|
|
|
" 0.25685987 0.25685987 0. 0. 0. 0.77057961\r\n", |
|
|
|
" 0. 0. 0. 0.25685987 0.19534855 0.25685987]]\r\n", |
|
|
|
"输出x_test文本向量:\r\n", |
|
|
|
"[[0. 0. 0. 0. 0. 0.\r\n", |
|
|
|
" 0. 0. 0. 0. 0. 0.\r\n", |
|
|
|
" 0. 0. 0. 0. 0. 0. ]\r\n", |
|
|
|
" [0. 0. 0. 0.70710678 0. 0.\r\n", |
|
|
|
" 0. 0. 0.70710678 0. 0. 0.\r\n", |
|
|
|
" 0. 0. 0. 0. 0. 0. ]]\r\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# TF-IDF demo: fit on training documents, then apply the fitted model to test documents.\n", |
"# Reference: https://blog.csdn.net/asialee_bird/article/details/81486700\n", |
"\n", |
"# The documents are pre-segmented: words are separated by spaces.\n", |
"x_train = [\n", |
"    'TF-IDF 主要 思想 是',\n", |
"    '算法 一个 重要 特点 可以 脱离 语料库 背景',\n", |
"    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要',\n", |
"]\n", |
"x_test = ['原始 文本 进行 标记', '主要 思想']\n", |
"\n", |
"# CountVectorizer builds the term-count matrix: element [i][j] is the number of\n", |
"# times term j occurs in document i.\n", |
"# NOTE: with the default token_pattern only tokens of two or more word characters\n", |
"# are kept, so single-character words such as '是' and '被' are dropped;\n", |
"# pass a custom token_pattern to CountVectorizer to keep them.\n", |
"vectorizer = CountVectorizer()\n", |
"# TfidfTransformer re-weights a count matrix into TF-IDF weights.\n", |
"tf_idf_transformer = TfidfTransformer()\n", |
"\n", |
"# Learn the vocabulary and the IDF statistics from the training documents only.\n", |
"x_train_tfidf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))\n", |
"# Dense view: element [i][j] is the TF-IDF weight of term j in training document i.\n", |
"x_train_weight = x_train_tfidf.toarray()\n", |
"\n", |
"# Reuse the fitted vocabulary and IDF for the test documents (transform, not fit_transform).\n", |
"# Terms never seen during fitting are ignored, so a test document made up only of\n", |
"# unseen terms becomes an all-zero row.\n", |
"x_test_tfidf = tf_idf_transformer.transform(vectorizer.transform(x_test))\n", |
"x_test_weight = x_test_tfidf.toarray()  # TF-IDF weight matrix of the test documents\n", |
"\n", |
"print('输出x_train文本向量:')\n", |
"print(x_train_weight)\n", |
"print('输出x_test文本向量:')\n", |
"print(x_test_weight)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"metadata": { |
|
|
|
"scrolled": true, |
|
|
|
"tags": [] |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": null, |
|
|
|
"metadata": { |
|
|
|
"scrolled": true |
|
|
|
}, |
|
|
|
"outputs": [], |
|
|
|
"source": [] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python 3", |
|
|
|
"language": "python", |
|
|
|
"name": "py35-paddle1.2.0" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.7.4" |
|
|
|
} |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 4 |
|
|
|
} |