如何加快编程的计算速度？ [关闭]答案

【问题标题】：how speed up the calculations of the programming? [closed]如何加快编程的计算速度？ [关闭]
【发布时间】：2015-02-28 04:57:25
【问题描述】：

对两个文件中的两列标题进行标题匹配。标题可以看作是字符串，文件 A 有 162283 行 × 12 列，文件 B 有 3695 行 × 6 列。我使用了 Levenshtein 算法：对文件 B 中第 4 列的每一行，计算它与文件 A 中第 5 列的每一行的相似度，找出 A 中相似度最高的标题，并将该标题连同它在文件 A 中的 ID 一起附加到文件 B 的对应行上。

在计算相似度之前，我删除了字符串中的一些符号和单词，例如“:”、“-”、“season”、“episode”。这么简单的编程，这么大的数据，竟然用了200多分钟。我想知道为什么。

我先写了一个python程序，花了很长时间，然后我写了一个c++程序，花了更长的时间。为什么？

请参阅以下程序：

Python：

import csv
import re
import difflib

import operator

import Levenshtein
import datetime

import glob

import os

import fnmatch




# Python 2 script (it relies on dict.iteritems and on splitting rows read in
# "rb" mode with a str separator).
#
# Loads two tab-separated files. For every row of A.txt it scores every row of
# B.txt with Levenshtein.ratio on normalised titles, keeps the best-scoring B
# row, and writes "<A row>\t<B title>\t<B column 11>" to a dated output file.
#
# NOTE(review): the accompanying description says file B has 6 columns, yet
# b[j][11] reads a 12th column -- confirm which file is which.

# Rows of A.txt, each split into its tab-separated fields.
a=[]
with open("D:\\A.txt","rb") as f:
    for row in f:
        a.append(row.split("\t"))

# Rows of B.txt, each split into its tab-separated fields.
b=[]
with open("B.txt","rb") as k:
    for row in k:
        b.append(row.split("\t"))

# dd maps a re-joined A row to the key of its best-matching B row.
dd={}

ee={}

# One single-element list per A row holding the output line for that row.
my_list=[]

for i in range(len(a)):
    # ff maps "<B title>\t<B column 11>" to its similarity against a[i].
    ff={}
    for j in range(len(b)):
        # Normalise both titles: lower-case them and blank out commas, the
        # words "series"/"episode"/"season", quotes, dashes and colons, then
        # drop every space so only the significant characters are compared.
        # This work is repeated for every (i, j) pair although s1 depends only
        # on i and s2 only on j.
        s1=re.sub(r',',' ',a[i][3])
        s1=s1.lower()
        s2=re.sub(r',',' ',b[j][4])
        s2=s2.lower()
        s1=re.sub(r'series',' ',s1)
        s1=re.sub(r'episode',' ',s1)
        s2=re.sub(r'series',' ',s2)
        s2=re.sub(r'episode',' ',s2)
        s1=re.sub(r'season',' ',s1)
        s2=re.sub(r'season',' ',s2)
        s1=re.sub(r'"',' ',s1)
        s2=re.sub(r'"',' ',s2)
        s1=re.sub(r'-',' ',s1)
        s2=re.sub(r'-',' ',s2)
        s2=re.sub(r':',' ',s2)
        s1=re.sub(r':',' ',s1)
        s1=re.sub(r' ','',s1)
        s2=re.sub(r' ','',s2)
        d=float(Levenshtein.ratio(s1,s2))
        # Rows of B that share title and column 11 share one entry; the last
        # one scored wins.
        ff[b[j][4]+"\t"+str(b[j][11])]=d
        qq="\t".join(a[i])
    # Key of the highest-scoring B row for this A row.
    dd[qq]=max(ff.iteritems(),key=operator.itemgetter(1))[0]
    my_list.append([qq.strip()+"\t"+dd[qq]])

# Output file name carries today's date as yymmdd.
datestr=datetime.date.today().strftime("%y%m%d")
filename="good2_codes_{}".format(datestr)+'.txt'
# NOTE(review): the original literal here had a mismatched curly closing quote,
# which is a SyntaxError. The minimal repair keeps the visible "C" prefix; if a
# drive path such as "C:\\" was intended, confirm and adjust.
File=open("C"+filename,'w')
for item in my_list:
    File.write(str(item[0])+"\n")
File.close()

c++:

#include <string>
#include<iostream>
#include <algorithm>
#include <fstream>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <vector>
#include <boost/algorithm/string/replace.hpp>


using namespace std;

// Returns the Levenshtein (edit) distance between s1 and s2: the minimum
// number of single-character insertions, deletions and substitutions needed
// to turn one string into the other.
//
// Only one row of the dynamic-programming table is kept, so the extra memory
// is s2.size() + 1 counters. The row lives in a std::vector, so it is released
// automatically on every exit path (no manual delete[]).
size_t uiLevenshteinDistance (const std::string &s1, const std::string &s2)
{
  const size_t m = s1.size();
  const size_t n = s2.size();
  // Distance to or from an empty string is the length of the other string.
  if (m == 0) return n;
  if (n == 0) return m;

  // costs[j] is the distance between the prefix of s1 processed so far and
  // the first j characters of s2; initially that prefix is empty.
  std::vector<size_t> costs(n + 1);
  for (size_t k = 0; k <= n; ++k) costs[k] = k;

  for (size_t i = 0; i < m; ++i)
  {
    costs[0] = i + 1;
    // corner is the table cell diagonally up-left of the cell being filled.
    size_t corner = i;
    for (size_t j = 0; j < n; ++j)
    {
      const size_t upper = costs[j + 1];
      if (s1[i] == s2[j])
      {
        // Matching characters cost nothing extra.
        costs[j + 1] = corner;
      }
      else
      {
        // Cheapest of insertion (left), deletion (upper), substitution (corner).
        costs[j + 1] = std::min(std::min(costs[j], upper), corner) + 1;
      }
      corner = upper;
    }
  }

  return costs[n];
}

// Reads A.txt and B.txt line by line. For every line of A it scores every line
// of B with 1 - editDistance / max(length) on normalised titles (field 3 of A,
// field 4 of B), then prints each A line followed by the key of the
// best-scoring B line (B field 4, a tab, B field 11).
int main()

{ 
  // All lines of A.txt, unsplit.
  std::vector<std::string> lines;
  std::ifstream file("A.txt");

  std::string line;
  while (std::getline(file,line)) {
       lines.push_back(line);
  }
  // All lines of B.txt, unsplit.
  std::vector<std::string> foxs;
  std::ifstream file1("B.txt");

  std::string fox;
  while (std::getline(file1,fox)) {
       foxs.push_back(fox);
  }
 // Maps an A line to the key of its best match. Being an unordered map, it
 // stores identical A lines once and is not iterated in input order.
 boost::unordered_map<std::string, std::string> hashtable1;

 for (int i=0; i< (int) lines.size(); i++)
 { boost::unordered_map<std::string, float> hashtable;
  for (int j=0; j<(int) foxs.size(); j++)

{    
    // NOTE: both lines are copied, split and normalised again for every
    // (i, j) pair, although the A side depends only on i and the B side only
    // on j.
    std::string str=lines[i];
  std::vector<std::string> tokens;
  boost::split(tokens,str,boost::algorithm::is_any_of("\t"));
   std::string str1=foxs[j];
   std::vector<std::string> tokens1;

  boost::split(tokens1,str1,boost::algorithm::is_any_of("\t"));
     // NOTE(review): tokens[3], tokens1[4] and tokens1[11] (below) are not
     // bounds-checked; a line with fewer tab-separated fields is undefined
     // behaviour.
     std::string  s1=tokens[3];
     std::string  s2=tokens1[4];
       // Normalise: lower-case, then strip commas, dashes and filler words.
       boost::algorithm::to_lower(s1);
       boost::algorithm::to_lower(s2);
  boost::replace_all(s1,",","");
 boost::replace_all(s2,",","");
 boost::replace_all(s1,"-","");
boost::replace_all(s2,"-","");
boost::replace_all(s1,"season","");
boost::replace_all(s2,"season","");
boost::replace_all(s1,"episode","");
boost::replace_all(s2,"episode","");
boost::replace_all(s1,"series","");
boost::replace_all(s2,"series","");




//  size_t f = s1.find(",");
//  s1.replace(f, std::string(",").length(),"");
//  size_t f1=s2.find(",");
//   s2.replace(f1, std::string(",").length(),"");
//   size_t f2 = s1.find("season");
//  s1.replace(f2, std::string("season").length(),"");
//  size_t f3=s2.find("season");
//   s2.replace(f3, std::string(",").length(),"");
//  size_t f4 = s1.find("episode");
//  s1.replace(f4, std::string("episode").length(),"");
//  size_t f5=s2.find("episode");
//   s2.replace(f5, std::string("episode").length(),"");
//   size_t f6 = s1.find("series");
//  s1.replace(f6, std::string("series").length(),"");
//  size_t f7=s2.find("series");
//   s2.replace(f7, std::string("series").length(),"");
 // Drop double quotes, then all spaces.
 s1.erase(remove( s1.begin(), s1.end(), '\"' ),s1.end());   
 s2.erase(remove( s2.begin(), s2.end(), '\"' ),s2.end());
//size_t f10 = s1.find("-");
//  s1.replace(f10, std::string("-").length(),"");
//  size_t f11=s2.find("-");
//   s2.replace(f11, std::string("-").length(),"");
  boost::replace_all(s1," ","");
  boost::replace_all(s2," ","");

 // k = longer length, k2 = edit distance, k3 = similarity 1 - k2/k.
 // If both normalised strings are empty, k is 0 and k3 is 0/0.
 float k,k2,k3;
  k=float (std::max(s1.size(),s2.size()));
  k2=float ( uiLevenshteinDistance(s1,s2));
  k3=1-k2/k;
  // insert() does not overwrite: when several B lines produce the same key,
  // the first score inserted is kept.
  hashtable.insert(make_pair(tokens1[4]+"\t"+(std::string)tokens1[11],k3));
  }

  // Pick the key with the highest similarity. max starts at 0 and the
  // comparison is strict, so max_key stays empty when no score exceeds 0.
  float max=0;
std::string max_key;
for (auto itr=hashtable.begin(); itr !=hashtable.end(); itr++)
{
  if ((*itr).second>max)
{ 
  max=(*itr).second;
  max_key=(*itr).first;
}
}
hashtable1.insert(make_pair(lines[i],max_key));

}

// Print "<A line>\t<best key>" for every stored A line.
for (auto itr1=hashtable1.begin(); itr1 !=hashtable1.end(); itr1++)
 cout << (*itr1).first << "\t" << (*itr1).second << endl;



   return 0;

}

【问题讨论】：

标签： python c++ performance algorithm levenshtein-distance

【解决方案1】：

因为您要进行 len(a) * len(b) 次编辑距离计算。Levenshtein 编辑距离并不适合直接用来做这种大规模匹配；应当先缩小问题规模：将字符串规范化为小写，删除标点符号，拆分为词元（token），并使用 Porter、Snowball 之类的词干提取算法；之后就可以过滤掉没有共同单词、或共同单词不够多的字符串对；只有在显著缩小问题规模之后，才应该对剩下的字符串对使用 Levenshtein。

而Levenshtein python 模块之所以能与你的 C++ 实现竞争，是因为 Python 模块是用 C 编写的。

【讨论】：

【解决方案2】：

#include <string>
#include<iostream>
#include <algorithm>
#include <fstream>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <vector>
#include <boost/algorithm/string/replace.hpp>


using namespace std;

// Returns the Levenshtein (edit) distance between s1 and s2, i.e. the minimum
// number of single-character insertions, deletions and substitutions that
// transform one string into the other.
//
// A single table row of s2.size() + 1 counters is kept. It is stored in a
// std::vector so the memory is freed automatically, including when an
// exception propagates.
size_t uiLevenshteinDistance(const std::string &s1, const std::string &s2)
{
    // Distance to or from an empty string is the length of the other string.
    if(s1.empty()) return s2.size();
    if(s2.empty()) return s1.size();

    const size_t n = s2.size();

    // row[j] is the distance between the prefix of s1 consumed so far and the
    // first j characters of s2; it starts out for the empty prefix.
    std::vector<size_t> row(n + 1);
    for(size_t j = 0; j <= n; ++j) row[j] = j;

    size_t consumed = 0;
    for(const char c1 : s1)
    {
        // diag is the cell diagonally up-left of the one being filled.
        size_t diag = consumed;
        row[0] = ++consumed;
        for(size_t j = 0; j < n; ++j)
        {
            const size_t above = row[j + 1];
            if(c1 == s2[j])
            {
                // Equal characters: carry the diagonal cost over unchanged.
                row[j + 1] = diag;
            }
            else
            {
                // Cheapest of insertion (left), deletion (above) and
                // substitution (diag), plus one edit.
                row[j + 1] = std::min(std::min(row[j], above), diag) + 1;
            }
            diag = above;
        }
    }

    return row[n];
}

// Lower-cases a title and strips commas, dashes, the words "season",
// "episode" and "series", double quotes and finally all spaces, so that only
// the significant characters take part in the comparison.
static std::string normalizeTitle(std::string title)
{
    boost::algorithm::to_lower(title);
    boost::replace_all(title, ",", "");
    boost::replace_all(title, "-", "");
    boost::replace_all(title, "season", "");
    boost::replace_all(title, "episode", "");
    boost::replace_all(title, "series", "");
    title.erase(std::remove(title.begin(), title.end(), '\"'), title.end());
    boost::replace_all(title, " ", "");
    return title;
}

// Returns fields[index], or an empty string when the line has too few fields.
static std::string fieldOrEmpty(const std::vector<std::string> &fields, size_t index)
{
    return index < fields.size() ? fields[index] : std::string();
}

// Reads A.txt and B.txt, normalises field 3 of every A line and field 4 of
// every B line once, then for each A line finds the B line with the highest
// similarity 1 - editDistance / max(length). Prints each A line followed by a
// tab and the best B line's key (B field 4, a tab, B field 11). The key is
// empty when no B line scores above 0.
int main()

{
    std::vector<std::string> lines;
    std::ifstream file("A.txt");

    std::string line;
    while(std::getline(file, line)) {
        lines.push_back(line);
    }
    std::vector<std::string> foxs;
    std::ifstream file1("B.txt");

    std::string fox;
    while(std::getline(file1, fox)) {
        foxs.push_back(fox);
    }

    // Normalised title of every A line, computed once per line.
    std::vector<std::string> tokens;
    std::vector<std::string> s1s;
    s1s.reserve(lines.size());
    for(size_t i = 0; i < lines.size(); ++i)
    {
        std::string str = lines[i];
        boost::split(tokens, str, boost::algorithm::is_any_of("\t"));
        s1s.push_back(normalizeTitle(fieldOrEmpty(tokens, 3)));
    }

    // Normalised title and output key of every B line. The key must be stored
    // per line: after this loop tokens1 only holds the fields of the LAST B
    // line, so building the key from it later would label every match with
    // that last line.
    std::vector<std::string> tokens1;
    std::vector<std::string> s2s;
    std::vector<std::string> keys;
    s2s.reserve(foxs.size());
    keys.reserve(foxs.size());
    for(size_t j = 0; j < foxs.size(); ++j)
    {
        std::string str1 = foxs[j];
        boost::split(tokens1, str1, boost::algorithm::is_any_of("\t"));
        const std::string title = fieldOrEmpty(tokens1, 4);
        keys.push_back(title + "\t" + fieldOrEmpty(tokens1, 11));
        s2s.push_back(normalizeTitle(title));
    }

    // Maps an A line to the key of its best match. Identical A lines are
    // stored once, and iteration order is not the input order.
    boost::unordered_map<std::string, std::string> hashtable1;
    for(size_t i = 0; i < lines.size(); ++i)
    {
        float best = 0;
        std::string bestKey;
        for(size_t j = 0; j < foxs.size(); ++j)
        {
            const size_t longest = std::max(s1s[i].size(), s2s[j].size());
            // Both titles empty after normalisation: the ratio would be 0/0,
            // which can never win, so skip the pair.
            if(longest == 0) continue;

            const float distance = static_cast<float>(uiLevenshteinDistance(s1s[i], s2s[j]));
            const float similarity = 1 - distance / static_cast<float>(longest);
            if(similarity > best)
            {
                best = similarity;
                bestKey = keys[j];
            }
        }
        hashtable1.insert(std::make_pair(lines[i], bestKey));
    }

    // '\n' instead of std::endl avoids flushing the stream after every line.
    for(auto itr1 = hashtable1.begin(); itr1 != hashtable1.end(); ++itr1)
        std::cout << itr1->first << "\t" << itr1->second << '\n';

    return 0;

}

【讨论】：