现在的位置: 首页 > 综合 > 正文

LCS/最长公共子串算法分析

2019年11月04日 ⁄ 综合 ⁄ 共 4502字 ⁄ 字号 评论关闭

最长公共子串

一、最长公共子串(Longest Common Substring ,简称LCS)问题,是指求给定的一组字符串长度最大的共有的子串的问题。例如字符串"abcb","bca","acbc"的LCS就是"bc"。

求多串的LCS,显然穷举法是极端低效的算法。直观的思路就是问题要求什么就找出什么。要子串,就找子串;要相同,就比较每个字符;要最长就记录最长。所以很容易就可以想到如下的解法。

 

//暴力求解法 
int longestCommonSubstring_n3(const string& str1, const string& str2)
{   
    size_t size1 = str1.size();
    size_t size2 = str2.size();
    if (size1 == 0 || size2 == 0) return 0;
 
    // the start position of substring in original string
    int start1 = -1;
    int start2 = -1;
    // the longest length of common substring 
    int longest = 0; 
 
    // record how many comparisons the solution did;
    // it can be used to know which algorithm is better
    int comparisons = 0;
 
    for (int i = 0; i < size1; ++i)
    {
        for (int j = 0; j < size2; ++j)
        {
            // find longest length of prefix 
            int length = 0;
            int m = i;
            int n = j;
            while(m < size1 && n < size2)
            {
                ++comparisons;
                if (str1[m] != str2[n]) break;
 
                ++length;
                ++m;
                ++n;
            }
 
            if (longest < length)
            {
                longest = length;
                start1 = i;
                start2 = j;
            }
        }
    }
#ifdef IDER_DEBUG
    cout<< "(first, second, comparisions) = (" 
        << start1 << ", " << start2 << ", " << comparisons 
        << ")" << endl;
#endif
    return longest;
}

该解法的思路就如前所说,以字符串中的每个字符作为子串的端点,判定以此为开始的子串的相同字符最长能达到的长度。其实从表层上想,这个算法的复杂度应该只有O(n2)因为该算法把每个字符都成对相互比较一遍,但关键问题在于比较两个字符串的效率并非是O(1),这也导致了实际的时间复杂度应该是满足Ω(n2)和O(n3)。

二、动态规划法 – 空间换时间

上述解法虽然不是最优的,但是依然可以从中找到一个改进的线索。不难发现在子串比较中有很多次重复的比较。比如再比较以i和j分别为起始点字符串时,有可能会进行i+1和j+1以及i+2和j+2位置的字符的比较;而在比较i+1和j+1分别为起始点字符串时,这些字符又会被比较一次了。也就是说该问题有非常相似的子问题,而子问题之间又有重叠,这就给动态规划法的应该提供了契机。

暴力解法是从字符串开端开始找寻,现在换个思维考虑以字符结尾的子串来利用动态规划法。

假设两个字符串分别为s和t,s[i]和t[j]分别表示其第i和第j个字符(字符顺序从0开始),再令L[i, j]表示以s[i]和s[j]为结尾的相同子串的最大长度。应该不难递推出L[i, j]和L[i+1,j+1]之间的关系,因为两者其实只差s[i+1]和t[j+1]这一对字符。若s[i+1]和t[j+1]不同,那么L[i+1, j+1]自然应该是0,因为任何以它们为结尾的子串都不可能完全相同;而如果s[i+1]和t[j+1]相同,那么就只要在以s[i]和t[j]结尾的最长相同子串之后分别添上这两个字符即可,这样就可以让长度增加一位。合并上述两种情况,也就得L[i+1,j+1]=(s[i]==t[j]
?L[i,j]+1:0)这样的关系。

最后就是要小心的就是临界位置:如若两个字符串中任何一个是空串,那么最长公共子串的长度只能是0;当i为0时,L[0,j]应该是等于L[-1,j-1]再加上s[0]和t[j]提供的值,但L[-1,j-1]本是无效,但可以视s[-1]是空字符也就变成了前面一种临界情况,这样就可知L[-1,j-1]==0,所以L[0,j]=(s[0]==t[j]?1:0)。对于j为0也是一样的,同样可得L[i,0]=(s[i]==t[0]?1:0)。

 

//动态规划法1
public static int lcs(String str1, String str2) {
  int len1 = str1.length();
  int len2 = str2.length();
  int result = 0;     //记录最长公共子串长度
  int c[][] = new int[len1+1][len2+1];
  for (int i = 0; i <= len1; i++) {
    for( int j = 0; j <= len2; j++) {
      if(i == 0 || j == 0) {
        c[i][j] = 0;
      } else if (str1.charAt(i-1) == str2.charAt(j-1)) {
        c[i][j] = c[i-1][j-1] + 1;
        result = max(c[i][j], result);
      } else {
        c[i][j] = 0;
      }
    }
  }
  return result;
}
//动态规划法2
 int longestCommonSubstring_n2_n2(const string& str1, const string& str2) 
{
    size_t size1 = str1.size();
    size_t size2 = str2.size();
    if (size1 == 0 || size2 == 0) return 0;
 
    vector<vector<int> > table(size1, vector<int>(size2, 0));
    // the start position of substring in original string
    int start1 = -1;
    int start2 = -1;
    // the longest length of common substring 
    int longest = 0; 
 
    // record how many comparisons the solution did;
    // it can be used to know which algorithm is better
    int comparisons = 0;
    for (int j = 0; j < size2; ++j)
    {
        ++comparisons;
        table[0][j] = (str1[0] == str2[j] ? 1 :0);
    }
 
    for (int i = 1; i < size1; ++i)
    {
        ++comparisons;
        table[i][0] = (str1[i] == str2[0] ? 1 :0);
 
        for (int j = 1; j < size2; ++j)
        {
            ++comparisons;
            if (str1[i] == str2[j])
            {
                table[i][j] = table[i-1][j-1]+1;
            }
        }
    }
 
    for (int i = 0; i < size1; ++i)
    {
        for (int j = 0; j < size2; ++j)
        {
            if (longest < table[i][j])
            {
                longest = table[i][j];
                start1 = i-longest+1;
                start2 = j-longest+1;
            }
        }
    }
#ifdef IDER_DEBUG
    cout<< "(first, second, comparisions) = (" 
        << start1 << ", " << start2 << ", " << comparisons 
        << ")" << endl;
#endif
 
    return longest;
}

动态规划法优化 – 能省一点是一点

仔细回顾之前的代码,其实可以做一些合并让代码变得更加简洁,比如最后一个求最长的嵌套for循环其实可以合并到之前计算整个表的for循环之中,每计算完L[i,j]就检查它是的值是不是更长。当合并代码之后,就会发现内部循环的过程重其实只用到了整个表的相邻两行而已,对于其它已经计算好的行之后就再也不会用到,而未计算的行曽之前也不会用到,因此考虑只用两行来存储计算值可能就足够。

于是新的经过再次优化的算法就有了:

 

//动态规划优化法
int longestCommonSubstring_n2_2n(const string& str1, const string& str2)
{   
    size_t size1 = str1.size();
    size_t size2 = str2.size();
    if (size1 == 0 || size2 == 0) return 0;
 
    vector<vector<int> > table(2, vector<int>(size2, 0));
 
    // the start position of substring in original string
    int start1 = -1;
    int start2 = -1;
    // the longest length of common substring 
    int longest = 0; 
 
    // record how many comparisons the solution did;
    // it can be used to know which algorithm is better
    int comparisons = 0;
    for (int j = 0; j < size2; ++j)
    {
        ++comparisons;
        if (str1[0] == str2[j]) 
        {
            table[0][j] = 1;
            if (longest == 0) 
            {
                longest = 1;
                start1 = 0;
                start2 = j;
            }
        }
    }
 
    for (int i = 1; i < size1; ++i)
    {
        ++comparisons;
        // with odd/even to swith working row
        int cur = ((i&1) == 1); //index for current working row
        int pre = ((i&1) == 0); //index for previous working row
        table[cur][0] = 0;
        if (str1[i] == str2[0]) 
        {
            table[cur][0] = 1;
            if (longest == 0) 
            {
                longest = 1;
                start1 = i;
                start2 = 0;
            }
        }
 
        for (int j = 1; j < size2; ++j)
        {
            ++comparisons;
            if (str1[i] == str2[j])
            {
                table[cur][j] = table[pre][j-1]+1;
                if (longest < table[cur][j])
                {
                    longest = table[cur][j];
                    start1 = i-longest+1;
                    start2 = j-longest+1;
                }
            }
            else 
            {
                table[cur][j] = 0;
            }
        }
    }
 
#ifdef IDER_DEBUG
    cout<< "(first, second, comparisions) = (" 
        << start1 << ", " << start2 << ", " << comparisons 
        << ")" << endl;
#endif
 
    return longest;
} 

更优秀的有广义后缀树的方法,时间可以达到 O(NL)。我还没有搞明白。。。。。

 

抱歉!评论已关闭.