#3 添加'Text_representation'模块

Closed
ZhangbuDong wants to merge 9 commits from :master into master
  1. +0
    -3
      README.md
  2. +187
    -0
      Text_representation.ipynb

+ 0
- 3
README.md View File

@@ -1,3 +0,0 @@
# learning_NLP

动手学习自然语言处理相关的源码分享

+ 187
- 0
Text_representation.ipynb View File

@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2022-11-26T07:01:22.955342Z",
"iopub.status.busy": "2022-11-26T07:01:22.955081Z",
"iopub.status.idle": "2022-11-26T07:01:23.974518Z",
"shell.execute_reply": "2022-11-26T07:01:23.973459Z",
"shell.execute_reply.started": "2022-11-26T07:01:22.955319Z"
},
"scrolled": true
},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer # Bags of Words,也称为Count Vectors\n",
"from sklearn.feature_extraction.text import TfidfTransformer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bag of Words"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2022-11-26T07:01:23.990837Z",
"iopub.status.busy": "2022-11-26T07:01:23.990080Z",
"iopub.status.idle": "2022-11-26T07:01:23.997705Z",
"shell.execute_reply": "2022-11-26T07:01:23.996746Z",
"shell.execute_reply.started": "2022-11-26T07:01:23.990805Z"
},
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1 1 1 0 0 1 0 1]\r\n",
" [0 2 0 1 0 1 1 0 1]\r\n",
" [1 0 0 1 1 0 1 1 1]\r\n",
" [0 1 1 1 0 0 1 0 1]]\r\n",
"['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\r\n"
]
}
],
"source": [
"# Bag-of-words demo: fit a CountVectorizer on a tiny English corpus, then print\n",
"# the document-term count matrix followed by the learned vocabulary.\n",
"# Reference: https://cloud.tencent.com/developer/article/1676848\n",
"# CountVectorizer is imported once, in the notebook's first cell.\n",
"\n",
"corpus = [\n",
"    'This is the first document.',\n",
"    'This document is the second document.',\n",
"    'And this is the third one.',\n",
"    'Is this the first document?',\n",
"]\n",
"\n",
"# Cell-specific names keep this fitted vectorizer separate from the\n",
"# `vectorizer` object that the TF-IDF cell creates.\n",
"bow_vectorizer = CountVectorizer()\n",
"bow_counts = bow_vectorizer.fit_transform(corpus)  # sparse document-term matrix\n",
"\n",
"# Reference: https://www.cnblogs.com/wuchuanying/p/6231912.html\n",
"# .toarray() expands the sparse matrix into a dense array for printing\n",
"# (.todense() is the alternative mentioned in the reference above).\n",
"print(bow_counts.toarray())\n",
"\n",
"# vocabulary_ maps each term to its feature index; print the terms sorted.\n",
"print(sorted(bow_vectorizer.vocabulary_))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TF-IDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# TF-IDF demo on Chinese text that is already segmented by spaces.\n",
"# Reference: https://blog.csdn.net/asialee_bird/article/details/81486700\n",
"# CountVectorizer and TfidfTransformer are imported in the notebook's first cell.\n",
"\n",
"x_train = [\n",
"    'TF-IDF 主要 思想 是',\n",
"    '算法 一个 重要 特点 可以 脱离 语料库 背景',\n",
"    '如果 一个 网页 被 很多 其他 网页 链接 说明 网页 重要',\n",
"]\n",
"x_test = ['原始 文本 进行 标记', '主要 思想']\n",
"\n",
"# CountVectorizer's default token_pattern r'(?u)\\b\\w\\w+\\b' keeps only tokens of\n",
"# two or more word characters, so single-character words such as '是' and '被'\n",
"# were silently dropped and 'TF-IDF' was split into 'tf' and 'idf'. The corpus\n",
"# is pre-segmented, so treat every whitespace-separated chunk as one token.\n",
"vectorizer = CountVectorizer(token_pattern=r'(?u)\\S+')\n",
"# Re-weights raw term counts into TF-IDF scores.\n",
"tf_idf_transformer = TfidfTransformer()\n",
"\n",
"# Fit the vocabulary and the IDF statistics on the training documents only.\n",
"# Rows are documents, columns are vocabulary terms.\n",
"train_counts = vectorizer.fit_transform(x_train)\n",
"x_train_weight = tf_idf_transformer.fit_transform(train_counts).toarray()\n",
"\n",
"# The test documents reuse the fitted vocabulary and IDF weights; words that\n",
"# never appeared in x_train have no column and contribute nothing.\n",
"test_counts = vectorizer.transform(x_test)\n",
"x_test_weight = tf_idf_transformer.transform(test_counts).toarray()\n",
"\n",
"print('输出x_train文本向量:')\n",
"print(x_train_weight)\n",
"print('输出x_test文本向量:')\n",
"print(x_test_weight)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "py35-paddle1.2.0"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Loading…
Cancel
Save