|
- # 导入词云制作库wordcloud和中文分词库jieba
- import os
- import re
- from collections import Counter
-
- import jieba
- import wordcloud
-
# Build a word cloud of how much more often each word appears in the 2021
# annual reports than in the 2020 ones.

# Input roots, one per fiscal year. Each root is listed, and every entry that
# can itself be listed as a directory has all of its files read as UTF-8 text.
REPORT_DIR_2021 = "../../2021年报_text_dir/"
REPORT_DIR_2020 = "../../2020年报_text_dir/"

# Only the first MAX_ENTRIES names returned by os.listdir() are processed.
# NOTE(review): os.listdir() order is arbitrary, so the two years may not
# sample the same reports -- confirm whether a sorted listing is wanted.
MAX_ENTRIES = 250

# Words whose year-over-year increase reaches this value are left out.
INCREMENT_CEILING = 100

OUTPUT_IMAGE = '2020-2021中国上市公司企业年报信息增量图_wo_top100_lac.png'

# Punctuation that is replaced by a space before segmentation. The characters
# are passed through re.escape() so that '-', '[' and ']' are literal members
# of the class; written unescaped, the first ']' closes the class early and the
# remaining characters become a literal sequence that never occurs in text.
# The trailing escapes add the full-width comma, exclamation mark, question
# mark, parentheses, curly single quotes and yen sign.
PUNCTUATION_CHARS = (
    "_.!+-=—,$%^。?、~@#¥…&*[]《》<>「」{}【】()/'"
    "\uff0c\uff01\uff1f\uff08\uff09\u2018\u2019\uffe5"
)
PUNCTUATION_RE = re.compile("[" + re.escape(PUNCTUATION_CHARS) + "]")
DIGIT_RE = re.compile(r"\d")


def clean_text(text):
    """Return *text* prepared for word segmentation.

    Leading/trailing whitespace and all newlines are removed, every digit is
    deleted, and each punctuation character listed in PUNCTUATION_CHARS is
    replaced by a single space.
    """
    text = text.strip().replace("\n", "")
    text = DIGIT_RE.sub("", text)
    return PUNCTUATION_RE.sub(" ", text)


def tokenize(text):
    """Segment *text* with jieba and return the non-blank tokens."""
    # NOTE(review): the jieba README says paddle mode has to be switched on
    # with jieba.enable_paddle(); this script never calls it -- confirm whether
    # use_paddle=True actually takes effect here.
    tokens = jieba.lcut(text, use_paddle=True)
    # Whitespace-only tokens are separators left by clean_text(), not words.
    return [token for token in tokens if token.strip()]


def count_words(root, limit=MAX_ENTRIES):
    """Count word occurrences over the report files found under *root*.

    Args:
        root: Directory whose entries are per-report sub-directories.
        limit: Maximum number of entries of *root* to process.

    Returns:
        A Counter mapping each word to its number of occurrences.
    """
    counts = Counter()
    for entry in os.listdir(root)[:limit]:
        entry_dir = os.path.join(root, entry)
        try:
            filenames = os.listdir(entry_dir)
        except OSError:
            # Entry is not a directory that can be listed: skip it.
            continue
        for filename in filenames:
            file_path = os.path.join(entry_dir, filename)
            with open(file_path, "r", encoding="utf-8") as handle:
                text = handle.read()
            counts.update(tokenize(clean_text(text)))
    return counts


def frequency_increment(current, previous, ceiling=INCREMENT_CEILING):
    """Return ``current[word] - previous[word]`` for the words in both mappings.

    Only differences strictly below *ceiling* are kept.

    NOTE(review): zero and negative differences pass this filter and are handed
    to the word cloud as frequencies -- confirm that this is intended.
    """
    return {
        word: count - previous[word]
        for word, count in current.items()
        if word in previous and count - previous[word] < ceiling
    }


def main():
    """Count both years, print the increments and save the word cloud image."""
    # font_path points at a Microsoft YaHei font file (MSYH.TTC) in the
    # current working directory.
    cloud = wordcloud.WordCloud(width=1000,
                                height=700,
                                background_color='white',
                                font_path='./MSYH.TTC')

    counts_2021 = count_words(REPORT_DIR_2021)
    counts_2020 = count_words(REPORT_DIR_2020)

    result = frequency_increment(counts_2021, counts_2020)
    print(result)
    cloud.fit_words(result)

    # Write the image into the current working directory.
    cloud.to_file(OUTPUT_IMAGE)


if __name__ == "__main__":
    main()
|