【发布时间】:2011-02-24 07:13:27
【问题描述】:
我正在尝试学习 Haskell,在 reddit 上一篇关于 Markov 文本链的文章之后,我决定先在 Python 中实现 Markov 文本生成,现在在 Haskell 中实现。但是我注意到我的 python 实现比 Haskell 版本快得多,甚至 Haskell 也被编译为本机代码。我想知道我应该怎么做才能使 Haskell 代码运行得更快,现在我相信它会因为使用 Data.Map 而不是 hashmap 而慢得多,但我不确定
我将发布 Python 代码和 Haskell。使用相同的数据,Python 大约需要 3 秒,而 Haskell 则接近 16 秒。
毫无疑问,我会接受任何建设性的批评:)。
import random
import re
import cPickle
class Markov:
def __init__(self, filenames):
self.filenames = filenames
self.cache = self.train(self.readfiles())
picklefd = open("dump", "w")
cPickle.dump(self.cache, picklefd)
picklefd.close()
def train(self, text):
splitted = re.findall(r"(\w+|[.!?',])", text)
print "Total of %d splitted words" % (len(splitted))
cache = {}
for i in xrange(len(splitted)-2):
pair = (splitted[i], splitted[i+1])
followup = splitted[i+2]
if pair in cache:
if followup not in cache[pair]:
cache[pair][followup] = 1
else:
cache[pair][followup] += 1
else:
cache[pair] = {followup: 1}
return cache
def readfiles(self):
data = ""
for filename in self.filenames:
fd = open(filename)
data += fd.read()
fd.close()
return data
def concat(self, words):
sentence = ""
for word in words:
if word in "'\",?!:;.":
sentence = sentence[0:-1] + word + " "
else:
sentence += word + " "
return sentence
def pickword(self, words):
temp = [(k, words[k]) for k in words]
results = []
for (word, n) in temp:
results.append(word)
if n > 1:
for i in xrange(n-1):
results.append(word)
return random.choice(results)
def gentext(self, words):
allwords = [k for k in self.cache]
(first, second) = random.choice(filter(lambda (a,b): a.istitle(), [k for k in self.cache]))
sentence = [first, second]
while len(sentence) < words or sentence[-1] is not ".":
current = (sentence[-2], sentence[-1])
if current in self.cache:
followup = self.pickword(self.cache[current])
sentence.append(followup)
else:
print "Wasn't able to. Breaking"
break
print self.concat(sentence)
Markov(["76.txt"])
--
module Markov
( train
, fox
) where
import Debug.Trace
import qualified Data.Map as M
import qualified System.Random as R
import qualified Data.ByteString.Char8 as B
type Database = M.Map (B.ByteString, B.ByteString) (M.Map B.ByteString Int)
train :: [B.ByteString] -> Database
train (x:y:[]) = M.empty
train (x:y:z:xs) =
let l = train (y:z:xs)
in M.insertWith' (\new old -> M.insertWith' (+) z 1 old) (x, y) (M.singleton z 1) `seq` l
main = do
contents <- B.readFile "76.txt"
print $ train $ B.words contents
fox="The quick brown fox jumps over the brown fox who is slow jumps over the brown fox who is dead."
【问题讨论】:
-
有意思,也在寻找答案。 16 秒与 3 秒的差别真的很大。
-
顺便说一句,Python 代码的缩进似乎被破坏了......
-
我不认为你的 Haskell 代码能完成你想要的。如果您检查输出,您将看到
M.Map String Int映射中没有大于 2 的值。你是说n + o还是o + 1而不是n + 1? -
@Travis 你是绝对正确的,但它应该在编辑版本中修复
-
您在“in M.insertWith'”开头的行中使用
seq是可疑的。您正在构建一个大型表达式并对其进行评估,然后丢弃结果并返回其他内容。你的意思是切换参数,即 lseqM.insertWith ...
标签: performance optimization haskell