|
- # 导入词云制作库wordcloud和中文分词库jieba
- import os
- import re
- from collections import Counter
-
- import jieba
- import wordcloud
-
# Punctuation/symbol characters that are replaced by a space before segmentation.
# NOTE: inside the character class, "+-=" is a *range* (U+002B..U+003D), so the
# pattern also matches , - . / 0-9 : ; < in addition to "+" and "=".
r = "[_.!+-=——,$%^,。?、~@#¥%……&*《》<>「」{}【】()()/'']"

_PUNCTUATION_RE = re.compile(r)
_DIGIT_RE = re.compile(r"\d")

# Report directories (one sub-directory per entry, text files inside) and output.
_DIR_2021 = "../../2021年报_text_dir/"
_DIR_2020 = "../../2020年报_text_dir/"
_OUTPUT_PNG = '2020-2021中国上市公司企业年报信息tri减量图.png'


def clean_text(text: str) -> str:
    """Return *text* with newlines and digits removed and punctuation blanked.

    Newlines are removed (not replaced by a space) so that hard-wrapped lines
    are rejoined. The same rule has to be applied to both years, otherwise the
    two trigram counts are not comparable.
    """
    text = text.strip().replace("\n", "")
    text = _DIGIT_RE.sub("", text)
    return _PUNCTUATION_RE.sub(" ", text)


def count_trigrams(tokens: list) -> Counter:
    """Count the concatenation of every three consecutive tokens.

    A window is skipped only when its *first* token is a single space; windows
    that contain a space in the second or third position are still counted.
    Fewer than three tokens yield an empty counter.
    """
    return Counter(
        first + second + third
        for first, second, third in zip(tokens, tokens[1:], tokens[2:])
        if first != " "
    )


def collect_trigram_counts(root: str, limit: int = 1000) -> Counter:
    """Count trigrams over all files in the first *limit* entries of *root*.

    Entries are taken in sorted order because ``os.listdir`` returns names in
    arbitrary order; sorting makes the sample reproducible between runs.
    Every file is read as UTF-8, cleaned with ``clean_text`` and segmented
    with ``jieba.lcut``.
    """
    counts = Counter()
    for entry in sorted(os.listdir(root))[:limit]:
        entry_path = os.path.join(root, entry)
        try:
            file_names = os.listdir(entry_path)
        except OSError:
            # Not a directory, or not readable: skip this entry.
            continue
        for file_name in file_names:
            file_path = os.path.join(entry_path, file_name)
            with open(file_path, "r", encoding="utf-8") as handle:
                text = handle.read()
            # Update incrementally instead of keeping every trigram in a list.
            counts.update(count_trigrams(jieba.lcut(clean_text(text))))
    return counts


def frequency_decrease(earlier: dict, later: dict) -> dict:
    """Return ``earlier[key] - later[key]`` for every key present in both.

    Keys keep the iteration order of *later*. Values may be zero or negative
    when a term did not decrease.
    """
    return {key: earlier[key] - later[key] for key in later if key in earlier}


def main() -> None:
    """Render the 2020 -> 2021 trigram decrease word cloud to a PNG file."""
    # Build and configure the word cloud. MSYH.TTC is Microsoft YaHei; a font
    # with Chinese glyphs is needed to draw the terms.
    w = wordcloud.WordCloud(width=1000,
                            height=700,
                            background_color='white',
                            font_path='./MSYH.TTC')

    _text_dir_2021 = collect_trigram_counts(_DIR_2021)
    _text_dir_2020 = collect_trigram_counts(_DIR_2020)

    result = frequency_decrease(_text_dir_2020, _text_dir_2021)
    print(result)

    # fit_words interprets the values as frequencies, so only terms that
    # actually decreased (positive difference) are meaningful input.
    decreased = {term: drop for term, drop in result.items() if drop > 0}
    if not decreased:
        raise SystemExit("No trigram occurs less often in 2021 than in 2020; "
                         "nothing to draw.")
    w.fit_words(decreased)

    # Export the word cloud image to the current directory.
    w.to_file(_OUTPUT_PNG)


if __name__ == "__main__":
    main()
|