您只需将每一行拆分为单词并存储起来,在读取完所有文件并存储它们的单词之后,使用 random.sample 从中选择 100 个单词。这正是我在下面的代码中所做的。不过,我不太确定它能否处理 170 部小说,因为这可能会占用大量内存。
import random
import os
import glob
import sys
import errno
path = '/Users/roelsmeets/Desktop/libris_corpus_clean/*.txt'

# Number of words written to the output file.
SAMPLE_SIZE = 100


def read_words(paths):
    """Return every whitespace-separated word found in the given text files.

    Args:
        paths: iterable of file paths, each opened as UTF-8 text.

    Returns:
        A flat list of words, in file order and then reading order.

    Raises:
        IOError: for any I/O failure other than "path is a directory".
    """
    collected = []
    for text in paths:
        try:
            with open(text, 'rt', encoding='utf-8') as f:
                for line in f:
                    collected.extend(line.split())
        except IOError as exc:
            # Do not fail if a directory is found, just ignore it.
            if exc.errno != errno.EISDIR:
                raise
    return collected


def pick_sample(population, size):
    """Return up to `size` items drawn from `population` without replacement.

    random.sample raises ValueError when asked for more items than exist,
    so the request is clamped to the population size; an empty population
    yields an empty list.
    """
    return random.sample(population, min(size, len(population)))


files = glob.glob(path)
# NOTE: all words of all files are held in memory at once before sampling.
words = read_words(files)
random_sample_input = pick_sample(words, SAMPLE_SIZE)

# Write the sampled words to a new file, one per line. The context manager
# closes the file even if writing fails.
with open("randomsample", "w", encoding='utf-8') as random_sample_output:
    random_sample_output.writelines(
        word + "\n" for word in random_sample_input
    )
在上面的代码中,一部小说的单词越多,它在输出样本中被选中的可能性就越大。这可能是、也可能不是您想要的行为。如果您希望每部小说具有相同的权重,可以先从每部小说中选择 100 个单词添加到 words 变量中,最后再从 words 中选择 100 个单词。这样做还有一个附带的好处:占用的内存更少,因为一次只需在内存中保存一部小说的单词。
import random
import os
import glob
import sys
import errno
path = '/Users/roelsmeets/Desktop/libris_corpus_clean/*.txt'

# Words kept from each novel, so every novel has the same weight.
WORDS_PER_NOVEL = 100
# Number of words written to the output file.
SAMPLE_SIZE = 100


def pick_sample(population, size):
    """Return up to `size` items drawn from `population` without replacement.

    random.sample raises ValueError when asked for more items than exist,
    so the request is clamped to the population size.
    """
    return random.sample(population, min(size, len(population)))


def sample_words_per_novel(paths, per_novel):
    """Return a flat list holding up to `per_novel` random words per file.

    Only one file's words are held in memory at a time; after sampling,
    the per-file list is discarded.

    Args:
        paths: iterable of file paths, each opened as UTF-8 text.
        per_novel: maximum number of words to keep from each file.

    Returns:
        A flat list of words (strings), grouped by file in input order.

    Raises:
        IOError: for any I/O failure other than "path is a directory".
    """
    kept = []
    for text in paths:
        novel = []
        try:
            with open(text, 'rt', encoding='utf-8') as f:
                for line in f:
                    novel.extend(line.split())
        except IOError as exc:
            # Do not fail if a directory is found, just ignore it.
            if exc.errno != errno.EISDIR:
                raise
            continue
        # extend, not append: appending would nest one list per novel, and
        # the final sample would then pick lists instead of words.
        kept.extend(pick_sample(novel, per_novel))
    return kept


files = glob.glob(path)
words = sample_words_per_novel(files, WORDS_PER_NOVEL)
random_sample_input = pick_sample(words, SAMPLE_SIZE)

# Write the sampled words to a new file, one per line. The context manager
# closes the file even if writing fails.
with open("randomsample", "w", encoding='utf-8') as random_sample_output:
    random_sample_output.writelines(
        word + "\n" for word in random_sample_input
    )
第三个版本:这个版本处理的是句子而不是单词,并且保留标点符号。此外,无论篇幅大小,每本书在最终保留的句子中都具有相同的“权重”。请记住,句子检测是由一种非常聪明、但并非万无一失的算法完成的。
import random
import os
import glob
import sys
import errno
import nltk.data
path = '/home/clement/Documents/randomPythonScripts/data/*.txt'
files = glob.glob(path)
sentence_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')

# Sentences kept from each file, so every book has the same weight.
SENTENCES_PER_FILE = 30
# Number of sentences printed and written to the output file.
SAMPLE_SIZE = 15

listOfSentences = []
for text in files:
    try:
        with open(text, 'rt', encoding='utf-8') as f:
            fullText = f.read()
    except IOError as exc:
        # Do not fail if a directory is found, just ignore it.
        if exc.errno != errno.EISDIR:
            raise
        continue
    sentences = sentence_detector.tokenize(fullText)
    # Clamp the request: random.sample raises ValueError when asked for
    # more sentences than the file contains.
    chosen = random.sample(sentences, min(SENTENCES_PER_FILE, len(sentences)))
    # Collapse newlines and repeated whitespace so that each sentence fits
    # on a single output line.
    listOfSentences += [" ".join(x.split()) for x in chosen]

random_sample_input = random.sample(
    listOfSentences, min(SAMPLE_SIZE, len(listOfSentences))
)
print(random_sample_input)

# Write the sampled sentences to a new file, one per line. The context
# manager closes the file even if writing fails.
with open("randomsample", "w", encoding='utf-8') as random_sample_output:
    random_sample_output.writelines(
        sentence + "\n" for sentence in random_sample_input
    )