Python 手動拆分數據集

def data(fileName, fileOut):
    """Copy a text file, appending ``|2`` to every line that starts with ``>``.

    Each input line is stripped of leading and trailing whitespace before it
    is examined. Lines beginning with ``>`` are written with ``|2`` appended;
    every other line is written as-is. Each output line is terminated by a
    single newline.

    Args:
        fileName: Path of the text file to read.
        fileOut: Path of the file to write; it is created or truncated.
    """
    # Context managers guarantee both handles are closed even when a read or
    # write raises part-way through.
    with open(fileName) as src, open(fileOut, 'w') as dst:
        # Stream line by line instead of loading the whole file into memory.
        for raw in src:
            line = raw.strip()
            if line.startswith(">"):
                line += '|2'
            dst.write(line + "\n")
import random
def data1(fileName, fileOut1, fileOut2, valid_fraction=0.2):
    """Randomly split a two-lines-per-record file into validation and training files.

    The input is read as consecutive pairs of lines: the first line of each
    pair is kept with only its newline removed, the second is stripped of all
    surrounding whitespace. The records are shuffled with ``random.shuffle``;
    the first ``round(n * valid_fraction)`` records are written to
    ``fileOut1`` and the remainder to ``fileOut2``, each record again as two
    lines. The sizes of both subsets are printed.

    Args:
        fileName: Path of the input file.
        fileOut1: Path of the validation output file (created or truncated).
        fileOut2: Path of the training output file (created or truncated).
        valid_fraction: Share of the records that goes to the validation
            file, between 0 and 1 inclusive. The default of 0.2 reproduces
            the split that used to be hard-coded (2838 of 14189 records).

    Raises:
        ValueError: If ``valid_fraction`` is outside the range [0, 1].
    """
    if not 0 <= valid_fraction <= 1:
        raise ValueError(
            "valid_fraction must be between 0 and 1, got %r" % (valid_fraction,)
        )

    with open(fileName) as src:
        lines = src.readlines()

    # Pair line 0 with line 1, line 2 with line 3, and so on. The record
    # count comes from the file itself rather than a fixed line total; zip
    # stops at the shorter slice, so an unpaired trailing line is ignored.
    records = [
        (first.strip('\n'), second.strip())
        for first, second in zip(lines[0::2], lines[1::2])
    ]
    random.shuffle(records)

    valid_count = int(round(len(records) * valid_fraction))
    valid_records = records[:valid_count]
    train_records = records[valid_count:]
    print(len(valid_records))
    print(len(train_records))

    # Each output file is opened in a context manager so it is closed even
    # if a write fails.
    for path, subset in ((fileOut1, valid_records), (fileOut2, train_records)):
        with open(path, 'w') as out:
            for first, second in subset:
                out.write(first + '\n')
                out.write(second + '\n')

if __name__ == "__main__":
    # Run the split only when this file is executed as a script, so that
    # importing the module does not read or write any files.
    data1('PDB_Pronghe.txt', 'PDB_valid.txt', 'PDB_train.txt')

發佈了8 篇原創文章 · 獲贊 1 · 訪問量 1859
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章