def data(fileName, fileOut):
    """Copy a text file, appending ``|2`` to every line that starts with ``>``.

    Each input line is stripped of leading and trailing whitespace. Lines
    beginning with ``>`` are written with ``|2`` appended; every other line
    is written as-is. Each output line is terminated by a single newline.

    Args:
        fileName: Path of the text file to read.
        fileOut: Path of the file to write; it is created or truncated.

    Returns:
        None. The result is written to ``fileOut``.
    """
    # Context managers guarantee both handles are closed even if a read or
    # write raises part-way through.
    with open(fileName) as src, open(fileOut, 'w') as dst:
        # Iterate the file lazily instead of loading every line up front.
        for line in src:
            line = line.strip()
            if line.startswith(">"):
                line += '|2'
            dst.write(line + "\n")
import random
def data1(fileName, fileOut1, fileOut2, validRatio=0.2):
    """Randomly split a two-lines-per-record file into two output files.

    The input is read as consecutive pairs of lines: the first line of each
    pair (with newline characters removed) is the record key and the second
    line (with surrounding whitespace removed) is its value. The records are
    shuffled with ``random.shuffle``; the first ``round(n * validRatio)`` of
    them are written to ``fileOut1`` and the remainder to ``fileOut2``, each
    record as two lines. The sizes of both subsets are printed.

    The number of records is taken from the file itself, so inputs of any
    length work. The default ratio of 0.2 reproduces the split that used to
    be hard-coded (2838 of 14189 records).

    Args:
        fileName: Path of the input file.
        fileOut1: Path of the validation output file (created or truncated).
        fileOut2: Path of the training output file (created or truncated).
        validRatio: Fraction of records, between 0 and 1 inclusive, that goes
            to ``fileOut1``.

    Raises:
        ValueError: If ``validRatio`` is outside the range [0, 1].
    """
    if not 0 <= validRatio <= 1:
        raise ValueError("validRatio must be between 0 and 1")

    # Read the input completely before opening the outputs for writing.
    with open(fileName) as src:
        lines = src.readlines()

    # Pair line i with line i + 1. Stopping at len(lines) - 1 means a
    # trailing line without a partner is ignored instead of raising
    # IndexError.
    records = [
        (lines[i].strip('\n'), lines[i + 1].strip())
        for i in range(0, len(lines) - 1, 2)
    ]
    random.shuffle(records)

    validCount = round(len(records) * validRatio)
    validlist = records[:validCount]
    trainlist = records[validCount:]
    print(len(validlist))
    print(len(trainlist))

    # Context managers close both outputs even if a write fails.
    with open(fileOut1, 'w') as wvalid, open(fileOut2, 'w') as wtrain:
        for out, subset in ((wvalid, validlist), (wtrain, trainlist)):
            for key, value in subset:
                out.write(key + '\n')
                out.write(value + '\n')
# Script entry: split 'PDB_Pronghe.txt' into a validation file and a
# training file.
data1(
    fileName='PDB_Pronghe.txt',
    fileOut1='PDB_valid.txt',
    fileOut2='PDB_train.txt',
)
python手動拆分數據集
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.