From 4349f2056e6c9ae4e07544d2bf615c4f301e9f0c Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Fri, 25 Nov 2022 08:40:19 +0800 Subject: [PATCH 1/8] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20'README.md'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add description --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 95ad3dc..81bbe02 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,3 @@ -# learning_NLP +# team_learning_NLP_ML -动手学习自然语言处理相关的源码分享 \ No newline at end of file +动手学习自然语言处理相关的源码分享(机器学习版本) \ No newline at end of file -- 2.34.1 From 775bab2e78b23157e3883169cf71c687eb295f49 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 09:28:08 +0800 Subject: [PATCH 2/8] add TF-IDF ML version code --- main.ipynb | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 main.ipynb diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..80db2b9 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2022-11-25T00:58:46.527043Z", + "iopub.status.busy": "2022-11-25T00:58:46.526188Z", + "iopub.status.idle": "2022-11-25T00:58:47.362414Z", + "shell.execute_reply": "2022-11-25T00:58:47.361231Z", + "shell.execute_reply.started": "2022-11-25T00:58:46.527005Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer # Bags of Words,也称为Count Vectors\r\n", + "from sklearn.feature_extraction.text import TfidfTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2022-11-25T01:07:56.913022Z", + "iopub.status.busy": "2022-11-25T01:07:56.912251Z", + "iopub.status.idle": "2022-11-25T01:07:56.924495Z", + "shell.execute_reply": "2022-11-25T01:07:56.923636Z", + "shell.execute_reply.started": "2022-11-25T01:07:56.912988Z" + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "输出x_train文本向量:\r\n", + "[[0.5 0.5 0. 0.5 0. 0.\r\n", + " 0. 0. 0.5 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]\r\n", + " [0. 0. 0.28428538 0. 0. 0.37380112\r\n", + " 0. 0. 0. 0.37380112 0.37380112 0.\r\n", + " 0.37380112 0.37380112 0.37380112 0. 0.28428538 0. ]\r\n", + " [0. 0. 0.19534855 0. 0.25685987 0.\r\n", + " 0.25685987 0.25685987 0. 0. 0. 0.77057961\r\n", + " 0. 0. 0. 0.25685987 0.19534855 0.25685987]]\r\n", + "输出x_test文本向量:\r\n", + "[[0. 0. 0. 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]\r\n", + " [0. 0. 0. 0.70710678 0. 0.\r\n", + " 0. 0. 0.70710678 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]]\r\n" + ] + } + ], + "source": [ + "#参考博客: https://blog.csdn.net/asialee_bird/article/details/81486700\r\n", + "\r\n", + "x_train = ['TF-IDF 主要 思想 是','算法 一个 重要 特点 可以 脱离 语料库 背景',\r\n", + " '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要']\r\n", + "x_test=['原始 文本 进行 标记','主要 思想']\r\n", + " \r\n", + "#该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频\r\n", + "vectorizer = CountVectorizer()\r\n", + "#该类会统计每个词语的tf-idf权值\r\n", + "tf_idf_transformer = TfidfTransformer()\r\n", + "#将文本转为词频矩阵并计算tf-idf\r\n", + "tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))\r\n", + "#将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重\r\n", + "x_train_weight = tf_idf.toarray()\r\n", + " \r\n", + "#对测试集进行tf-idf权重计算\r\n", + "tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))\r\n", + "x_test_weight = tf_idf.toarray() # 测试集TF-IDF权重矩阵\r\n", + " \r\n", + "print('输出x_train文本向量:')\r\n", + "print(x_train_weight)\r\n", + "print('输出x_test文本向量:')\r\n", + "print(x_test_weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "py35-paddle1.2.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- 2.34.1 From a668bebc329c5ef181f752bb5b48ff5cfd87e647 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 09:36:54 +0800 Subject: [PATCH 3/8] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20'main.ipynb'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/main.ipynb b/main.ipynb index 80db2b9..0bff891 100644 --- a/main.ipynb +++ b/main.ipynb @@ -61,6 +61,7 @@ "source": [ "#参考博客: https://blog.csdn.net/asialee_bird/article/details/81486700\r\n", "\r\n", + "x_train = ['TF-IDF 主要 思想 是','算法 一个 重要 特点 可以 脱离 语料库 背景',\r\n", " '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要']\r\n", "x_test=['原始 文本 进行 标记','主要 思想']\r\n", -- 2.34.1 From ce3ba207d0ed212526e029c4fabc1aee2bd99955 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 15:05:46 +0800 Subject: [PATCH 4/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=20'main.ipynb'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.ipynb | 119 ----------------------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 main.ipynb diff --git a/main.ipynb b/main.ipynb deleted file mode 100644 index 0bff891..0000000 --- a/main.ipynb +++ /dev/null @@ -1,119 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2022-11-25T00:58:46.527043Z", - "iopub.status.busy": "2022-11-25T00:58:46.526188Z", - "iopub.status.idle": "2022-11-25T00:58:47.362414Z", - "shell.execute_reply": "2022-11-25T00:58:47.361231Z", - "shell.execute_reply.started": "2022-11-25T00:58:46.527005Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "from sklearn.feature_extraction.text import CountVectorizer # Bags of Words,也称为Count Vectors\r\n", - "from sklearn.feature_extraction.text import TfidfTransformer" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2022-11-25T01:07:56.913022Z", - "iopub.status.busy": "2022-11-25T01:07:56.912251Z", - "iopub.status.idle": "2022-11-25T01:07:56.924495Z", - "shell.execute_reply": "2022-11-25T01:07:56.923636Z", - "shell.execute_reply.started": "2022-11-25T01:07:56.912988Z" - }, - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "输出x_train文本向量:\r\n", - "[[0.5 0.5 0. 0.5 0. 0.\r\n", - " 0. 0. 0.5 0. 0. 0.\r\n", - " 0. 0. 0. 0. 0. 0. ]\r\n", - " [0. 0. 0.28428538 0. 0. 0.37380112\r\n", - " 0. 0. 0. 0.37380112 0.37380112 0.\r\n", - " 0.37380112 0.37380112 0.37380112 0. 0.28428538 0. ]\r\n", - " [0. 0. 0.19534855 0. 0.25685987 0.\r\n", - " 0.25685987 0.25685987 0. 0. 0. 0.77057961\r\n", - " 0. 0. 0. 0.25685987 0.19534855 0.25685987]]\r\n", - "输出x_test文本向量:\r\n", - "[[0. 0. 0. 0. 0. 0.\r\n", - " 0. 0. 0. 0. 0. 0.\r\n", - " 0. 0. 0. 0. 0. 0. ]\r\n", - " [0. 0. 0. 0.70710678 0. 0.\r\n", - " 0. 0. 0.70710678 0. 0. 0.\r\n", - " 0. 0. 0. 0. 0. 0. ]]\r\n" - ] - } - ], - "source": [ - "#参考博客: https://blog.csdn.net/asialee_bird/article/details/81486700\r\n", - "\r\n", - - "x_train = ['TF-IDF 主要 思想 是','算法 一个 重要 特点 可以 脱离 语料库 背景',\r\n", - " '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要']\r\n", - "x_test=['原始 文本 进行 标记','主要 思想']\r\n", - " \r\n", - "#该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频\r\n", - "vectorizer = CountVectorizer()\r\n", - "#该类会统计每个词语的tf-idf权值\r\n", - "tf_idf_transformer = TfidfTransformer()\r\n", - "#将文本转为词频矩阵并计算tf-idf\r\n", - "tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))\r\n", - "#将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重\r\n", - "x_train_weight = tf_idf.toarray()\r\n", - " \r\n", - "#对测试集进行tf-idf权重计算\r\n", - "tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))\r\n", - "x_test_weight = tf_idf.toarray() # 测试集TF-IDF权重矩阵\r\n", - " \r\n", - "print('输出x_train文本向量:')\r\n", - "print(x_train_weight)\r\n", - "print('输出x_test文本向量:')\r\n", - "print(x_test_weight)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "py35-paddle1.2.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} -- 2.34.1 From 9c6baf0279459aa5fd0e91bdbed4f93410c06881 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 15:07:57 +0800 Subject: [PATCH 5/8] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20'Text=20representation?= =?UTF-8?q?'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Text representation | 1 + 1 file changed, 1 insertion(+) create mode 100644 Text representation diff --git a/Text representation b/Text representation new file mode 100644 index 0000000..047d20c --- /dev/null +++ b/Text representation @@ -0,0 +1 @@ +文本表示 -- 2.34.1 From df386a4923ae51194b4a29a7a143f08538d98357 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 15:08:13 +0800 Subject: [PATCH 6/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=20'Text=20representation?= =?UTF-8?q?'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Text representation | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Text representation diff --git a/Text representation b/Text representation deleted file mode 100644 index 047d20c..0000000 --- a/Text representation +++ /dev/null @@ -1 +0,0 @@ -文本表示 -- 2.34.1 From 89cca7ba9024d87585cac34660b19c69e0359b46 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 15:08:32 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E8=87=B3=20''?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Text_representation.ipynb | 187 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 Text_representation.ipynb diff --git a/Text_representation.ipynb b/Text_representation.ipynb new file mode 100644 index 0000000..2c4fe34 --- /dev/null +++ b/Text_representation.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2022-11-26T07:01:22.955342Z", + "iopub.status.busy": "2022-11-26T07:01:22.955081Z", + "iopub.status.idle": "2022-11-26T07:01:23.974518Z", + "shell.execute_reply": "2022-11-26T07:01:23.973459Z", + "shell.execute_reply.started": "2022-11-26T07:01:22.955319Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer # Bags of Words,也称为Count Vectors\n", + "from sklearn.feature_extraction.text import TfidfTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bags of words" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2022-11-26T07:01:23.990837Z", + "iopub.status.busy": "2022-11-26T07:01:23.990080Z", + "iopub.status.idle": "2022-11-26T07:01:23.997705Z", + "shell.execute_reply": "2022-11-26T07:01:23.996746Z", + "shell.execute_reply.started": "2022-11-26T07:01:23.990805Z" + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0 1 1 1 0 0 1 0 1]\r\n", + " [0 2 0 1 0 1 1 0 1]\r\n", + " [1 0 0 1 1 0 1 1 1]\r\n", + " [0 1 1 1 0 0 1 0 1]]\r\n", + "['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\r\n" + ] + } + ], + "source": [ + "# 参考博客:https://cloud.tencent.com/developer/article/1676848\r\n", + "\r\n", + "from sklearn.feature_extraction.text import CountVectorizer\r\n", + "corpus = [\r\n", + "'This is the first document.',\r\n", + "'This document is the second document.',\r\n", + "'And this is the third one.',\r\n", + "'Is this the first document?',\r\n", + "]\r\n", + "vectorizer = CountVectorizer()\r\n", + "print(vectorizer.fit_transform(corpus).toarray())\r\n", + "\r\n", + "# 参考博客:https://www.cnblogs.com/wuchuanying/p/6231912.html\r\n", + "# print(vectorizer.fit_transform(corpus).todense()) #todense将稀疏矩阵转化为完整特征矩阵\r\n", + "print(sorted(vectorizer.vocabulary_.keys()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TF-IDF" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2022-11-26T07:01:23.976719Z", + "iopub.status.busy": "2022-11-26T07:01:23.975725Z", + "iopub.status.idle": "2022-11-26T07:01:23.988971Z", + "shell.execute_reply": "2022-11-26T07:01:23.988172Z", + "shell.execute_reply.started": "2022-11-26T07:01:23.976686Z" + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "输出x_train文本向量:\r\n", + "[[0.5 0.5 0. 0.5 0. 0.\r\n", + " 0. 0. 0.5 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]\r\n", + " [0. 0. 0.28428538 0. 0. 0.37380112\r\n", + " 0. 0. 0. 0.37380112 0.37380112 0.\r\n", + " 0.37380112 0.37380112 0.37380112 0. 0.28428538 0. ]\r\n", + " [0. 0. 0.19534855 0. 0.25685987 0.\r\n", + " 0.25685987 0.25685987 0. 0. 0. 0.77057961\r\n", + " 0. 0. 0. 0.25685987 0.19534855 0.25685987]]\r\n", + "输出x_test文本向量:\r\n", + "[[0. 0. 0. 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]\r\n", + " [0. 0. 0. 0.70710678 0. 0.\r\n", + " 0. 0. 0.70710678 0. 0. 0.\r\n", + " 0. 0. 0. 0. 0. 0. ]]\r\n" + ] + } + ], + "source": [ + "# 参考博客: https://blog.csdn.net/asialee_bird/article/details/81486700\n", + "\n", + "x_train = ['TF-IDF 主要 思想 是','算法 一个 重要 特点 可以 脱离 语料库 背景',\n", + " '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要']\n", + "x_test=['原始 文本 进行 标记','主要 思想']\n", + " \n", + "# 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频\n", + "vectorizer = CountVectorizer()\n", + "# 该类会统计每个词语的tf-idf权值\n", + "tf_idf_transformer = TfidfTransformer()\n", + "# 将文本转为词频矩阵并计算tf-idf\n", + "tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))\n", + "# 将tf-idf矩阵抽取出来,元素a[i][j]表示j词在i类文本中的tf-idf权重\n", + "x_train_weight = tf_idf.toarray()\n", + " \n", + "#对测试集进行tf-idf权重计算\n", + "tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))\n", + "x_test_weight = tf_idf.toarray() # 测试集TF-IDF权重矩阵\n", + " \n", + "print('输出x_train文本向量:')\n", + "print(x_train_weight)\n", + "print('输出x_test文本向量:')\n", + "print(x_test_weight)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "py35-paddle1.2.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- 2.34.1 From d9ee90bb07469e9d0020a0818df56d75d6668374 Mon Sep 17 00:00:00 2001 From: ZhangbuDong Date: Sat, 26 Nov 2022 15:11:21 +0800 Subject: [PATCH 8/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=20'README.md'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 81bbe02..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# team_learning_NLP_ML - -动手学习自然语言处理相关的源码分享(机器学习版本) \ No newline at end of file -- 2.34.1