# coding=utf-8
import os
import io  # io.open accepts the encoding= keyword on both Python 2.7 and Python 3
import re
import sys
import time
import jieba
import jieba.posseg as pseg

sys.path.append("../")
jieba.load_userdict("../Python27/fenci/dict.txt")  # load the custom segmentation dictionary
'''
title: Chinese corpus preprocessing with jieba segmentation: a single-file
processor and a batch (folder) processor.
1. Walk the source text tree
2. Mirror the original directory structure for the results
3. Run jieba segmentation and stopword removal on each text
4. Normalize the preprocessed results and save them under the mirrored paths
author: 白寧超
myblog: http://www.cnblogs.com/baiboy/
time: 2017-04-28 10:03:09
'''
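'''
Prerequisites: the jieba package (e.g. "pip install jieba") and two UTF-8
resource files: dict.txt, a jieba user dictionary with one entry per line in
the documented form "word [frequency [POS tag]]", and chstop.txt, assumed
here to hold one stopword per line.
'''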
'''
Segmentation, POS tagging, and stopword removal (single file).
stopwordspath: path to the stopword list
dealpath: path of the Chinese text file to preprocess
savepath: path where the preprocessed result is saved
'''
"""
def cutTxtWord(dealpath, savepath, stopwordspath):
stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')]) # 停用詞表
with open(dealpath, "r", encoding='utf-8') as f:
txtlist = f.read() # 讀取待處理的文本
words = pseg.cut(txtlist) # 帶詞性標註的分詞結果
cutresult = "" # 獲取去除停用詞後的分詞結果
for word, flag in words:
if word not in stopwords:
cutresult += word + "/" + flag + " " # 去停用詞
getFlag(cutresult, savepath) #
"""
'''
Segmentation, POS tagging, and stopword removal (batch over folders).
stopwordspath: path to the stopword list
read_folder_path: root folder of the Chinese texts to preprocess
write_folder_path: root folder where the preprocessed results are saved
filescount: upper bound on the number of files processed per folder
'''
def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    # stopword lookup table, one stopword per line in the source file
    with io.open(stopwordspath, "r", encoding='utf-8') as sf:
        stopwords = set(line.rstrip() for line in sf)

    # every subfolder of the source root is treated as one category
    folder_list = os.listdir(read_folder_path)
    for folder in folder_list:
        new_folder_path = os.path.join(read_folder_path, folder)  # source folder of this category
        save_folder_path = os.path.join(write_folder_path, folder)  # matching output folder
        if not os.path.exists(save_folder_path):
            os.makedirs(save_folder_path)
            print(save_folder_path + ' created')
        print('--> Please wait, processing...')

        # loop over the files within this category
        files = os.listdir(new_folder_path)
        filescount = 300  # cap on the number of files processed per folder
        j = 1
        for file in files:
            if j > filescount:
                break
            dealpath = os.path.join(new_folder_path, file)  # path of the file to process
            with io.open(dealpath, "r", encoding='utf-8') as f:
                txtlist = f.read()
            # optionally strip Chinese/English punctuation before segmenting:
            # txtlist = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", txtlist)
            words = pseg.cut(txtlist)  # segmentation result with POS tags
            cutresult = ""  # this file's result after stopword removal
            for word, flag in words:
                if word not in stopwords:  # drop stopwords
                    cutresult += word + "/" + flag + " "
            savepath = os.path.join(save_folder_path, file)
            getFlag(cutresult, savepath)  # POS filtering and saving
            j += 1
'''
POS filtering.
cutresult: str, the initial segmentation result ("word/flag word/flag ...")
savepath: path where the filtered result is saved
'''
def getFlag(cutresult, savepath):
    txtlist = []  # tokens that survive the POS filter
    # POS tags to filter out; customize this list as needed
    cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    for line in cutresult.split('\n'):
        line_list2 = re.split('[ ]', line)
        line_list2.append("\n")  # keep the original paragraph breaks
        line_list = line_list2[:]  # copy, so removal does not disturb iteration
        for segs in line_list2:
            for K in cixing:
                if K in segs:
                    line_list.remove(segs)
                    break
        txtlist.extend(line_list)
    # strip the POS tags, keeping only the words themselves
    flagresult = ""
    for v in txtlist:
        if "/" in v:
            slope = v.index("/")
            flagresult += v[0:slope] + " "
        else:
            flagresult += v
    standdata(flagresult, savepath)
'''
Normalization: drop blank lines, stray whitespace, and so on.
flagresult: the POS-filtered result
savepath: path where the normalized result is saved
'''
def standdata(flagresult, savepath):
    with io.open(savepath, "w", encoding='utf-8') as f2:
        for line in flagresult.split('\n'):
            if len(line) >= 2:
                line_clean = "/ ".join(line.split())  # re-join tokens with "/ "
                f2.write(line_clean + " " + "\n")
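
'''
A worked example of the getFlag -> standdata chain, on hypothetical input:
with cutresult = "今天/t 天氣/n 不錯/a 了/ul ", the token "了/ul" is removed
because "/ul" appears in cixing; stripping the remaining tags yields
flagresult = "今天 天氣 不錯 \n", and standdata then writes
"今天/ 天氣/ 不錯 \n" to savepath, i.e. the kept words separated by "/ ".
'''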
if __name__ == '__main__':
    t1 = time.time()

    # test paths for a single file
    # dealpath = "../Database/SogouC/FileTest/1.txt"
    # savepath = "../Database/SogouCCut/FileTest/1.txt"
    # stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../Python27/fenci/chstop.txt'  # HIT (Harbin Institute of Technology) stopword list
    # batch-process the files under a folder
    # rfolder_path = '../Database/SogouC/Sample/'
    rfolder_path = '../Python27/fenci/FileNews/'
    # root folder for the segmented results
    wfolder_path = '../Python27/fenci/result/'

    # Chinese corpus preprocessors
    # cutTxtWord(dealpath, savepath, stopwordspath)  # single-file preprocessor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath1)  # batch preprocessor
    t2 = time.time()
    print("Chinese corpus preprocessing finished in " + str(t2 - t1) + " seconds.")
The run output is as follows:
Building prefix dict from the default dictionary ...
Loading model from cache c:\users\hp\appdata\local\temp\jieba.cache
Loading model cost 0.478 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
  File "D:/Python27/fenci/fenci4.py", line 10, in <module>
    jieba.load_userdict("../Python27/fenci/dict.txt") # 加載自定義分詞詞典
  File "D:\Python27\lib\site-packages\jieba\__init__.py", line 374, in load_userdict
    f = open(f, 'rb')
IOError: [Errno 2] No such file or directory: '../Python27/fenci/dict.txt'
Process finished with exit code 1
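
Why it fails: as the traceback's own "f = open(f, 'rb')" shows, load_userdict
opens the given path as-is, so a relative path like
"../Python27/fenci/dict.txt" is resolved against the interpreter's current
working directory, which here evidently is not the directory the author
assumed (the script itself already lives in D:/Python27/fenci/). A minimal
fix sketch, assuming dict.txt, chstop.txt, FileNews/ and result/ sit next to
the script, is to resolve every resource path against the script's own
location:

# resolve resources relative to this script, not the working directory
BASE = os.path.dirname(os.path.abspath(__file__))
jieba.load_userdict(os.path.join(BASE, "dict.txt"))
stopwordspath1 = os.path.join(BASE, "chstop.txt")
rfolder_path = os.path.join(BASE, "FileNews")
wfolder_path = os.path.join(BASE, "result")

The same substitution applies wherever the '../Python27/fenci/...' paths
appear above.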