【发布时间】:2020-03-05 22:31:00
【问题描述】:
我目前正在尝试在Node中实现SPIMI索引构造方法,但遇到了问题。
代码如下:
let fs = require("fs");
let path = require("path");
module.exports = {
fileStream: function (dirPath, fileStream) {
return buildFileStream(dirPath, fileStream);
},
buildSpimi: function (fileStream, outDir) {
let invIndex = {};
let sortedInvIndex = {};
let fileNameCount = 1;
let outputTXT = "";
let entryCounter = 0;
let resString = "";
fileStream.forEach((filePath, fileIndex) => {
let data = fs.readFileSync(filePath).toString('utf-8');
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(function (ch) { return ch.length != 0; });
data.forEach(token => {
//CHANGE THE SIZE IF NECESSARY (4e+?)
if (entryCounter > 100000) {
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
resString = "";
entryCounter = 0;
sortedInvIndex = {};
invIndex = {};
console.log(outputTXT + " - written;");
fileNameCount++;
};
if (invIndex[token] == undefined) {
invIndex[token] = [];
entryCounter++;
};
if (!invIndex[token].includes(fileIndex)) {
invIndex[token].push(fileIndex);
entryCounter++;
};
});
});
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
console.log(outputTXT + " - written;");
}
}
function buildFileStream(dirPath, fileStream) {
fileStream = fileStream || 0;
fs.readdirSync(dirPath).forEach(function (file) {
let filepath = path.join(dirPath, file);
let stat = fs.statSync(filepath);
if (stat.isDirectory()) {
fileStream = buildFileStream(filepath, fileStream);
} else {
fileStream.push(filepath);
}
});
return fileStream;
}
我在单独的文件中使用导出的函数:
let spimi = require("./spimi");
let outputDir = "/Users/me/Desktop/SPIMI_OUT/"
let inputDir = "/Users/me/Desktop/gutenberg/2/2";
fileStream = [];
let result = spimi.fileStream(inputDir, fileStream);
console.table(result)
console.log("Finished building the filestream");
let t0 = new Date();
spimi.buildSpimi(result, outputDir);
let t1 = new Date();
console.log(t1 - t0);
虽然这种代码在尝试处理相对较小的数据量时有效(我测试了高达 1.5 GB),但显然某处存在内存泄漏,因为在监控 RAM 使用情况时,我可以看到它上升到4-5 GB)。
我花了很多时间试图找出可能的原因,但我仍然找不到问题所在。
我将不胜感激任何提示! 谢谢!
【问题讨论】:
-
你试过
writeFileSync吗?
标签: node.js memory-leaks