【问题标题】:How to speed up the program's calculations? [closed]如何加快程序的计算速度? [关闭]
【发布时间】:2015-02-28 04:57:25
【问题描述】:

对两个文件中的两列标题进行标题匹配。标题可以看作是字符串。文件 A 有 162283 行 × 12 列,文件 B 有 3695 行 × 6 列。我使用了 Levenshtein 算法:对文件 B 中第 4 列的每一行,计算它与文件 A 中第 5 列的每一行的相似度,找出 A 中相似度最高的标题,并将该标题连同文件 A 中对应的 ID 一起附加到文件 B 的对应行上。

在计算相似度之前,我删除了字符串中的一些符号和单词,例如“:”、“-”、“season”、“episode”。这么简单的编程,这么大的数据,竟然用了200多分钟。我想知道为什么。

我先写了一个python程序,花了很长时间,然后我写了一个c++程序,花了更长的时间。为什么?

请参阅以下程序:

Python:

import csv
import re
import difflib

import operator

import Levenshtein
import datetime

import glob

import os

import fnmatch




# Title-matching script (Python 2: relies on dict.iteritems and on splitting
# the byte strings read in "rb" mode with a str separator).
#
# For every row of A.txt the script finds the row of B.txt whose title is most
# similar according to Levenshtein.ratio, and writes the A row followed by the
# best-matching B title and B column 11 to an output file.
#
# NOTE(review): the column indices below (a[..][3], b[..][4], b[..][11]) are
# kept exactly as in the original script; confirm them against the real file
# layouts.


def normalize_title(title):
    """Return `title` lower-cased with punctuation, filler words and spaces removed.

    The substitutions run in the same order as the original script: commas,
    lower-casing, the words "series"/"episode"/"season", then '"', '-' and ':'
    are all turned into spaces, and finally every space is dropped.
    """
    s = re.sub(r',', ' ', title)
    s = s.lower()
    for pattern in (r'series', r'episode', r'season', r'"', r'-', r':'):
        s = re.sub(pattern, ' ', s)
    return re.sub(r' ', '', s)


a=[]
with open("D:\\A.txt","rb") as f:
    for row in f:
        a.append(row.split("\t"))

b=[]
with open("B.txt","rb") as k:
    for row in k:
        b.append(row.split("\t"))

# Normalize every title exactly once. The original redid all the regex
# substitutions for both strings on every (i, j) pair, i.e. len(a) * len(b)
# times instead of len(a) + len(b) times.
a_titles=[normalize_title(row[3]) for row in a]
b_titles=[normalize_title(row[4]) for row in b]
# Key reported for a B row: its raw title and column 11, tab-separated.
b_keys=[row[4]+"\t"+str(row[11]) for row in b]

dd={}
my_list=[]

for i in range(len(a)):
    # Similarity of this A title to every B row, keyed by the B row's key.
    ff={}
    for j in range(len(b)):
        ff[b_keys[j]]=float(Levenshtein.ratio(a_titles[i],b_titles[j]))
    qq="\t".join(a[i])
    # Key of the B row with the highest similarity ratio.
    dd[qq]=max(ff.iteritems(),key=operator.itemgetter(1))[0]
    my_list.append([qq.strip()+"\t"+dd[qq]])

datestr=datetime.date.today().strftime("%y%m%d")
filename="good2_codes_{}".format(datestr)+'.txt'
# The original line used a typographic closing quote ("C”), which is a syntax
# error; the literal is restored as the plain string "C".
with open("C"+filename,'w') as File:
    for item in my_list:
        File.write(str(item[0])+"\n")

c++:

#include <string>
#include<iostream>
#include <algorithm>
#include <fstream>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <vector>
#include <boost/algorithm/string/replace.hpp>


using namespace std;

/// Computes the Levenshtein (edit) distance between two strings: the minimum
/// number of single-character insertions, deletions and substitutions needed
/// to turn `s1` into `s2`.
///
/// @param s1 first string
/// @param s2 second string
/// @return the edit distance; if either string is empty this is the length of
///         the other one
size_t uiLevenshteinDistance (const std::string &s1, const std::string &s2)
{
  const size_t m(s1.size());
  const size_t n(s2.size());
  if (m == 0) return n;
  if (n == 0) return m;

  // Single rolling row of the DP table: costs[j] is the distance between the
  // part of s1 consumed so far and the first j characters of s2. A vector
  // replaces the manual new[]/delete[] pair so the buffer frees itself.
  std::vector<size_t> costs(n + 1);
  for (size_t k = 0; k <= n; ++k) costs[k] = k;

  for (size_t i = 0; i < m; ++i)
  {
    costs[0] = i + 1;
    // `corner` is the diagonal neighbour (previous row, previous column).
    size_t corner = i;
    for (size_t j = 0; j < n; ++j)
    {
      // `upper` is the previous row's value for this column, read before it
      // is overwritten below.
      const size_t upper = costs[j + 1];
      if (s1[i] == s2[j])
      {
        costs[j + 1] = corner;
      }
      else
      {
        // Cheapest of deletion, insertion and substitution, plus one edit.
        costs[j + 1] = std::min(std::min(costs[j], upper), corner) + 1;
      }
      corner = upper;
    }
  }

  return costs[n];
}

/// Reads the tab-separated files A.txt and B.txt. For every distinct row of
/// A.txt it finds the row of B.txt whose normalized title is most similar
/// (1 - editDistance / longerLength) and prints the A row followed by the
/// matched B title and B column 11, tab-separated.
///
/// Output order is unspecified because results are collected in an unordered
/// map keyed by the A row; identical A rows are reported once.
///
/// @return 0 on success, 1 if either input file cannot be opened.
int main()
{
  // 0-based field positions, unchanged from the original program.
  // NOTE(review): confirm these against the real file layouts.
  const size_t kTitleColumnA = 3;
  const size_t kTitleColumnB = 4;
  const size_t kIdColumnB = 11;

  // Returns the requested field, or an empty string when the row is too short
  // (the original indexed past the end of the token vector in that case).
  const auto fieldOrEmpty = [](const std::vector<std::string> &fields, size_t index) -> std::string {
    return index < fields.size() ? fields[index] : std::string();
  };

  // Lower-cases the title and strips commas, hyphens, the words
  // "season"/"episode"/"series", double quotes and spaces, in that order.
  // NOTE(review): unlike the Python version of this program, ':' is not removed.
  const auto normalizeTitle = [](std::string title) -> std::string {
    boost::algorithm::to_lower(title);
    boost::replace_all(title, ",", "");
    boost::replace_all(title, "-", "");
    boost::replace_all(title, "season", "");
    boost::replace_all(title, "episode", "");
    boost::replace_all(title, "series", "");
    title.erase(std::remove(title.begin(), title.end(), '\"'), title.end());
    boost::replace_all(title, " ", "");
    return title;
  };

  std::ifstream fileA("A.txt");
  std::ifstream fileB("B.txt");
  if (!fileA || !fileB)
  {
    std::cerr << "Cannot open A.txt and/or B.txt\n";
    return 1;
  }

  std::vector<std::string> fields;
  std::string row;

  // Split and normalize every B row exactly once. The original repeated this
  // work inside the innermost loop, i.e. once per (A row, B row) pair.
  std::vector<std::string> candidateKeys;    // "<title>\t<column 11>" as printed
  std::vector<std::string> candidateTitles;  // normalized title used for scoring
  while (std::getline(fileB, row))
  {
    boost::split(fields, row, boost::algorithm::is_any_of("\t"));
    const std::string title = fieldOrEmpty(fields, kTitleColumnB);
    candidateKeys.push_back(title + "\t" + fieldOrEmpty(fields, kIdColumnB));
    candidateTitles.push_back(normalizeTitle(title));
  }

  boost::unordered_map<std::string, std::string> hashtable1;

  while (std::getline(fileA, row))
  {
    boost::split(fields, row, boost::algorithm::is_any_of("\t"));
    const std::string s1 = normalizeTitle(fieldOrEmpty(fields, kTitleColumnA));

    // Track the best candidate directly instead of filling a per-row hash map
    // and scanning it afterwards. A candidate must score strictly above 0 to
    // be selected, as before.
    float bestRatio = 0;
    size_t bestIndex = candidateKeys.size();  // sentinel: no match yet
    for (size_t j = 0; j < candidateTitles.size(); ++j)
    {
      const std::string &s2 = candidateTitles[j];
      const size_t longest = std::max(s1.size(), s2.size());
      // Two empty titles gave 0/0 (NaN) in the original and were therefore
      // never selected; skip them explicitly.
      if (longest == 0) continue;

      const float distance = static_cast<float>(uiLevenshteinDistance(s1, s2));
      const float ratio = 1 - distance / static_cast<float>(longest);
      if (ratio > bestRatio)
      {
        bestRatio = ratio;
        bestIndex = j;
      }
    }

    const std::string bestKey =
        bestIndex < candidateKeys.size() ? candidateKeys[bestIndex] : std::string();
    hashtable1.insert(std::make_pair(row, bestKey));
  }

  // '\n' instead of std::endl: same bytes, without a flush per output line.
  for (const auto &entry : hashtable1)
    std::cout << entry.first << "\t" << entry.second << '\n';

  return 0;
}

【问题讨论】:

    标签: python c++ performance algorithm levenshtein-distance


    【解决方案1】:

    因为您正在进行len(a) * len(b) 编辑距离计算。 Levenshtein 编辑距离不是用于这种匹配的工具;尝试减少问题集;将字符串规范化为小写,删除标点符号,拆分为标记 - 使用一些词干算法,如 Porter、Snowball;之后,您可以过滤掉不共享任何或大量单词的对;只有当你显着减少问题集时,你才应该对它们使用 Levenshtein。

    Levenshtein python 模块之所以能与你的 C++ 实现竞争,是因为 Python 模块是用 C 编写的。

    【讨论】:

      【解决方案2】:
      #include <string>
      #include<iostream>
      #include <algorithm>
      #include <fstream>
      #include <boost/unordered_map.hpp>
      #include <boost/algorithm/string.hpp>
      #include <boost/algorithm/string/classification.hpp>
      #include <vector>
      #include <boost/algorithm/string/replace.hpp>
      
      
      using namespace std;
      
      /// Returns the Levenshtein (edit) distance between `s1` and `s2`, i.e. the
      /// minimum number of single-character insertions, deletions and
      /// substitutions that transform one string into the other.
      ///
      /// @param s1 first string
      /// @param s2 second string
      /// @return the edit distance; when one string is empty, the length of the other
      size_t uiLevenshteinDistance(const std::string &s1, const std::string &s2)
      {
          const size_t m(s1.size());
          const size_t n(s2.size());
          if(m == 0) return n;
          if(n == 0) return m;

          // One rolling row of the DP table, owned by a vector so no manual
          // new[]/delete[] pairing is needed.
          std::vector<size_t> costs(n + 1);
          for(size_t k = 0; k <= n; k++) costs[k] = k;

          size_t i = 0;
          for(const char c1 : s1)
          {
              costs[0] = i + 1;
              // Value diagonally up-left of the cell being computed.
              size_t corner = i;
              size_t j = 0;
              for(const char c2 : s2)
              {
                  // Previous row's value for this column, saved before overwrite.
                  const size_t upper = costs[j + 1];
                  if(c1 == c2)
                  {
                      costs[j + 1] = corner;
                  }
                  else
                  {
                      // One edit on top of the cheapest neighbouring cell.
                      const size_t t = std::min(upper, corner);
                      costs[j + 1] = std::min(costs[j], t) + 1;
                  }
                  corner = upper;
                  ++j;
              }
              ++i;
          }

          return costs[n];
      }
      
      /// Reads the tab-separated files A.txt and B.txt, normalizes the title
      /// column of each file once, and for every distinct row of A.txt prints
      /// that row followed by the title and column 11 of the most similar
      /// B.txt row (similarity = 1 - editDistance / longerLength).
      ///
      /// Output order is unspecified because results are collected in an
      /// unordered map keyed by the A row; identical A rows are reported once.
      ///
      /// @return 0 on success, 1 if either input file cannot be opened.
      int main()
      {
          // Returns the requested field, or an empty string when the row has
          // too few columns (the original indexed out of bounds in that case).
          const auto fieldOrEmpty = [](const std::vector<std::string> &fields, size_t index) -> std::string {
              return index < fields.size() ? fields[index] : std::string();
          };

          // Lower-cases the title and strips commas, hyphens, the words
          // "season"/"episode"/"series", double quotes and spaces, in that order.
          const auto normalizeTitle = [](std::string title) -> std::string {
              boost::algorithm::to_lower(title);
              boost::replace_all(title, ",", "");
              boost::replace_all(title, "-", "");
              boost::replace_all(title, "season", "");
              boost::replace_all(title, "episode", "");
              boost::replace_all(title, "series", "");
              title.erase(std::remove(title.begin(), title.end(), '\"'), title.end());
              boost::replace_all(title, " ", "");
              return title;
          };

          std::ifstream file("A.txt");
          std::ifstream file1("B.txt");
          if(!file || !file1)
          {
              std::cerr << "Cannot open A.txt and/or B.txt\n";
              return 1;
          }

          std::vector<std::string> lines;
          std::string line;
          while(std::getline(file, line)) {
              lines.push_back(line);
          }

          std::vector<std::string> foxs;
          std::string fox;
          while(std::getline(file1, fox)) {
              foxs.push_back(fox);
          }

          // Normalized title of every A row (column index 3).
          std::vector<std::string> tokens;
          std::vector<std::string> s1s;
          s1s.reserve(lines.size());
          for(size_t i = 0; i < lines.size(); ++i)
          {
              boost::split(tokens, lines[i], boost::algorithm::is_any_of("\t"));
              s1s.push_back(normalizeTitle(fieldOrEmpty(tokens, 3)));
          }

          // Normalized title (column index 4) and output key of every B row.
          // The key must be stored per row: the original read tokens1[4] and
          // tokens1[11] inside the scoring loop, where tokens1 still held the
          // LAST B row, so every A row was reported against that one row.
          std::vector<std::string> tokens1;
          std::vector<std::string> s2s;
          std::vector<std::string> keys;
          s2s.reserve(foxs.size());
          keys.reserve(foxs.size());
          for(size_t j = 0; j < foxs.size(); ++j)
          {
              boost::split(tokens1, foxs[j], boost::algorithm::is_any_of("\t"));
              const std::string title = fieldOrEmpty(tokens1, 4);
              keys.push_back(title + "\t" + fieldOrEmpty(tokens1, 11));
              s2s.push_back(normalizeTitle(title));
          }

          boost::unordered_map<std::string, std::string> hashtable1;
          for(size_t i = 0; i < lines.size(); ++i)
          {
              // A candidate must score strictly above 0 to be selected;
              // otherwise the reported key stays empty.
              float max = 0;
              std::string max_key;
              for(size_t j = 0; j < s2s.size(); ++j)
              {
                  const size_t longest = std::max(s1s[i].size(), s2s[j].size());
                  // Two empty titles would give 0/0 (NaN), which can never
                  // win the comparison below; skip them explicitly.
                  if(longest == 0) continue;

                  const float k2 = static_cast<float>(uiLevenshteinDistance(s1s[i], s2s[j]));
                  const float k3 = 1 - k2 / static_cast<float>(longest);
                  if(k3 > max)
                  {
                      max = k3;
                      max_key = keys[j];
                  }
              }
              hashtable1.insert(std::make_pair(lines[i], max_key));
          }

          // '\n' instead of std::endl: same bytes, without a flush per line.
          for(const auto &entry : hashtable1)
              std::cout << entry.first << "\t" << entry.second << '\n';

          return 0;
      }
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2023-03-28
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多