On Windows 10, processing text with Python 2.7 fails with IOError: [Errno 2] No such file or directory. How can this be fixed?

# coding=utf-8
import os
import io  # io.open accepts the encoding argument under Python 2.7
import jieba
import sys
import re
import time
import jieba.posseg as pseg

sys.path.append("../")
jieba.load_userdict("../Python27/fenci/dict.txt")  # load the user-defined segmentation dictionary

'''
title: Chinese corpus preprocessing with jieba: a single-text processor and a batch-file processor
    1 traverse and locate the source texts
    2 recreate the directory structure of the original texts for the output
    3 run jieba word segmentation and stopword removal on the original texts
    4 normalize the preprocessing results and save them under the original directory structure
author: 白寧超
myblog: http://www.cnblogs.com/baiboy/
time: 2017-04-28 10:03:09
'''

'''
Word segmentation, POS tagging and stopword removal
stopwordspath: path to the stopword list
dealpath: path of the Chinese text to preprocess
savepath: path where the preprocessing result is saved
'''

"""
def cutTxtWord(dealpath, savepath, stopwordspath):
    stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')])  # 停用詞表
    with open(dealpath, "r", encoding='utf-8') as f:
        txtlist = f.read()  # 讀取待處理的文本
    words = pseg.cut(txtlist)  # 帶詞性標註的分詞結果
    cutresult = ""  # 獲取去除停用詞後的分詞結果
    for word, flag in words:
        if word not in stopwords:
            cutresult += word + "/" + flag + " "  # 去停用詞
            getFlag(cutresult, savepath)  #
"""

'''
Word segmentation, POS tagging and stopword removal
stopwordspath: path to the stopword list
read_folder_path: root path of the Chinese texts to preprocess
write_folder_path: root path where the preprocessing results are saved
filescount = 300  # upper limit on the number of files per folder
'''


def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    # stopword table
    stopwords = {}.fromkeys([line.rstrip() for line in io.open(stopwordspath, "r", encoding='utf-8')])

    # list all category folders under the root directory to be processed
    folder_list = os.listdir(read_folder_path)
    # loop over the categories
    for folder in folder_list:
        # path of one category folder
        new_folder_path = os.path.join(read_folder_path, folder)

        # create the output directory for this category
        save_folder_path = os.path.join(write_folder_path, folder)
        if not os.path.exists(save_folder_path):
            os.makedirs(save_folder_path)
            print(save_folder_path + ' created successfully')
        print('--> Please wait, processing...')

        # loop over the files within this category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files): break
            dealpath = os.path.join(new_folder_path, file)  # path of the single file to process
            with io.open(dealpath, "r", encoding='utf-8') as f:
                txtlist = f.read()
                # filter out Chinese/English punctuation and special symbols
                # txtlist1 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "",txtlist)
            words = pseg.cut(txtlist)  # segmentation result with POS tags
            cutresult = ""  # single text: result after segmentation and stopword removal
            for word, flag in words:
                if word not in stopwords:
                    cutresult += word + "/" + flag + " "  # drop stopwords
            savepath = os.path.join(save_folder_path, file)
            getFlag(cutresult, savepath)
            j += 1


'''
POS-based filtering
cutresult: str, the initial segmentation result
savepath: path where the file is saved
'''


def getFlag(cutresult, savepath):
    txtlist = []  # result after filtering out the unwanted POS tags
    # user-defined list of POS tags to filter out
    cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    for line in cutresult.split('\n'):
        line_list2 = re.split('[ ]', line)
        line_list2.append("\n")  # keep the original paragraph structure
        line_list = line_list2[:]
        for segs in line_list2:
            for K in cixing:
                if K in segs:
                    line_list.remove(segs)
                    break
        txtlist.extend(line_list)

    # strip the POS tags
    flagresult = ""
    for v in txtlist:
        if "/" in v:
            slope = v.index("/")
            letter = v[0:slope] + " "
            flagresult += letter
        else:
            flagresult += v
    standdata(flagresult, savepath)


'''
Normalization: remove empty lines, extra whitespace, etc.
flagresult: the filtered result
savepath: path where the file is saved
'''


def standdata(flagresult, savepath):
    f2 = io.open(savepath, "w", encoding='utf-8')
    for line in flagresult.split('\n'):
        if len(line) >= 2:
            line_clean = "/ ".join(line.split())
            lines = line_clean + " " + "\n"
            f2.write(lines)
        else:
            pass
    f2.close()


if __name__ == '__main__':
    t1 = time.time()

    # test on a single file
    # dealpath = "../Database/SogouC/FileTest/1.txt"
    # savepath = "../Database/SogouCCut/FileTest/1.txt"

    # stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath = '../Python27/fenci/chstop.txt'  # HIT (Harbin Institute of Technology) stopword list

    # batch-process all files under a folder
    # rfolder_path = '../Database/SogouC/Sample/'
    rfolder_path = '../Python27/fenci/FileNews/'
    # root path where the segmentation results are saved
    wfolder_path = '../Python27/fenci/result/'

    # Chinese corpus preprocessor
    # cutTxtWord(dealpath, savepath, stopwordspath)  # single-text preprocessor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath)  # batch preprocessor

    t2 = time.time()
    print("中文語料語處理完成,耗時:" + str(t2 - t1) + "秒。")  # 反饋結果


The output is as follows:
Building prefix dict from the default dictionary ...
Loading model from cache c:\users\hp\appdata\local\temp\jieba.cache
Loading model cost 0.478 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
  File "D:/Python27/fenci/fenci4.py", line 10, in <module>
    jieba.load_userdict("../Python27/fenci/dict.txt")  # load the user-defined segmentation dictionary
  File "D:\Python27\lib\site-packages\jieba\__init__.py", line 374, in load_userdict
    f = open(f, 'rb')
IOError: [Errno 2] No such file or directory: '../Python27/fenci/dict.txt'


Process finished with exit code 1
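
The IOError most likely comes from how the relative path is resolved: '../Python27/fenci/dict.txt' is interpreted relative to the current working directory, not relative to the script D:/Python27/fenci/fenci4.py, so unless the interpreter happens to be started from D:/ the path points at a directory that does not exist. Below is a minimal sketch of one possible fix that builds every path from the script's own directory; it assumes dict.txt and chstop.txt really sit next to fenci4.py (only the file names are taken from the listing above, their actual location is an assumption).

# Sketch only: resolve resource files relative to this script instead of the
# working directory. Assumes dict.txt and chstop.txt live next to fenci4.py.
import os
import jieba

BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # e.g. D:\Python27\fenci

dict_path = os.path.join(BASE_DIR, "dict.txt")
stopwordspath = os.path.join(BASE_DIR, "chstop.txt")

if not os.path.exists(dict_path):
    # print the resolved path and the working directory to see where Python is really looking
    print("dict.txt not found at: " + dict_path + " (cwd: " + os.getcwd() + ")")
else:
    jieba.load_userdict(dict_path)  # now independent of the working directory

Hard-coding an absolute path such as r'D:\Python27\fenci\dict.txt' would also work, but deriving it from __file__ keeps the script portable if the folder is ever moved.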


