现在的位置: 首页 > 综合 > 正文

后缀数组

2012年08月29日 ⁄ 综合 ⁄ 共 11314字 ⁄ 字号 评论关闭

  

  一些概念和理解:

Suffix(i): 表示从i位置到字符串末尾这个子串,(后缀数组);

sa[i]:表示排名为i的子串从哪个位置开始,即排名为i的子串为Suffix(sa[i]);

rank[i]:表示在字符串上,第i位置在后缀数组中的排名是多少;

height[i]:height[i] = Suffix(sa[i-1]) 和Suffix(sa[i])的最大公共前缀;

假设有j、k,且rank[j] < rank[k],则有:Suffix(j)和Suffix(k)的最大公共前缀为height[rank[j] + 1], height[rank[j] + 2], ... , height[rank[k]]中的最小值;

关于最长公共前缀:

求两个后缀的最长公共前缀即为求height数组中某区间上的最小值,显然这个可以用rmq问题解决。O(nlogn)的时间预处理,O(1)的时间询问。

ps:后缀数组还有好多应用,详见 罗穗骞《后缀数组——处理字符串的有力工具》

模板:

倍增算法

View Code

#define maxn 1000001
int wa[maxn],wb[maxn],wv[maxn],ws[maxn];
int cmp(int *r,int a,int b,int l)
{return r[a]==r[b]&&r[a+l]==r[b+l];}
void da(int *r,int *sa,int n,int m)
{
     int i,j,p,*x=wa,*y=wb,*t;
     for(i=0;i<m;i++) ws[i]=0;
     for(i=0;i<n;i++) ws[x[i]=r[i]]++;
     for(i=1;i<m;i++) ws[i]+=ws[i-1];
     for(i=n-1;i>=0;i--) sa[--ws[x[i]]]=i;
     for(j=1,p=1;p<n;j*=2,m=p)
     {
       for(p=0,i=n-j;i<n;i++) y[p++]=i;
       for(i=0;i<n;i++) if(sa[i]>=j) y[p++]=sa[i]-j;
       for(i=0;i<n;i++) wv[i]=x[y[i]];
       for(i=0;i<m;i++) ws[i]=0;
       for(i=0;i<n;i++) ws[wv[i]]++;
       for(i=1;i<m;i++) ws[i]+=ws[i-1];
       for(i=n-1;i>=0;i--) sa[--ws[wv[i]]]=y[i];
       for(t=x,x=y,y=t,p=1,x[sa[0]]=0,i=1;i<n;i++)
       x[sa[i]]=cmp(y,sa[i-1],sa[i],j)?p-1:p++;
     }
     return;
}
int rank[maxn],height[maxn];
void calheight(int *r,int *sa,int n)
{
     int i,j,k=0;
     for(i=1;i<=n;i++) rank[sa[i]]=i;
     for(i=0;i<n;height[rank[i++]]=k)
     for(k?k--:0,j=sa[rank[i]-1];r[i+k]==r[j+k];k++);
     return;
}
int RMQ[maxn];
int mm[maxn];
int best[20][maxn];
void initRMQ(int n)
{
     int i,j,a,b;
     for(i = 1; i <= n; ++i) RMQ[i] = height[i];
     for(mm[0]=-1,i=1;i<=n;i++)
     mm[i]=((i&(i-1))==0)?mm[i-1]+1:mm[i-1];
     for(i=1;i<=n;i++) best[0][i]=i;
     for(i=1;i<=mm[n];i++)
     for(j=1;j<=n+1-(1<<i);j++)
     {
       a=best[i-1][j];
       b=best[i-1][j+(1<<(i-1))];
       if(RMQ[a]<RMQ[b]) best[i][j]=a;
       else best[i][j]=b;
     }
     return;
}
int askRMQ(int a,int b)
{
    int t;
    t=mm[b-a+1];b-=(1<<t)-1;
    a=best[t][a];b=best[t][b];
    return RMQ[a]<RMQ[b]?a:b;
}
int lcp(int a,int b)
{
    int t;
    a=rank[a];b=rank[b];
    if(a>b) {t=a;a=b;b=t;}
    return(height[askRMQ(a+1,b)]);
}

dc3:

View Code

#define maxn 1000003       //注意扩大
#define F(x) ((x)/3+((x)%3==1?0:tb))
#define G(x) ((x)<tb?(x)*3+1:((x)-tb)*3+2)
int wa[maxn],wb[maxn],wv[maxn],ws[maxn];
int c0(int *r,int a,int b)
{return r[a]==r[b]&&r[a+1]==r[b+1]&&r[a+2]==r[b+2];}
int c12(int k,int *r,int a,int b)
{if(k==2) return r[a]<r[b]||r[a]==r[b]&&c12(1,r,a+1,b+1);
 else return r[a]<r[b]||r[a]==r[b]&&wv[a+1]<wv[b+1];}
void sort(int *r,int *a,int *b,int n,int m)
{
     int i;
     for(i=0;i<n;i++) wv[i]=r[a[i]];
     for(i=0;i<m;i++) ws[i]=0;
     for(i=0;i<n;i++) ws[wv[i]]++;
     for(i=1;i<m;i++) ws[i]+=ws[i-1];
     for(i=n-1;i>=0;i--) b[--ws[wv[i]]]=a[i];
     return;
}
void dc3(int *r,int *sa,int n,int m)
{
     int i,j,*rn=r+n,*san=sa+n,ta=0,tb=(n+1)/3,tbc=0,p;
     r[n]=r[n+1]=0;
     for(i=0;i<n;i++) if(i%3!=0) wa[tbc++]=i;
     sort(r+2,wa,wb,tbc,m);
     sort(r+1,wb,wa,tbc,m);
     sort(r,wa,wb,tbc,m);
     for(p=1,rn[F(wb[0])]=0,i=1;i<tbc;i++)
     rn[F(wb[i])]=c0(r,wb[i-1],wb[i])?p-1:p++;
     if(p<tbc) dc3(rn,san,tbc,p);
     else for(i=0;i<tbc;i++) san[rn[i]]=i;
     for(i=0;i<tbc;i++) if(san[i]<tb) wb[ta++]=san[i]*3;
     if(n%3==1) wb[ta++]=n-1;
     sort(r,wb,wa,ta,m);
     for(i=0;i<tbc;i++) wv[wb[i]=G(san[i])]=i;
     for(i=0,j=0,p=0;i<ta && j<tbc;p++)
     sa[p]=c12(wb[j]%3,r,wa[i],wb[j])?wa[i++]:wb[j++];
     for(;i<ta;p++) sa[p]=wa[i++];
     for(;j<tbc;p++) sa[p]=wb[j++];
     return;
}
int rank[maxn],height[maxn];
void calheight(int *r,int *sa,int n)
{
     int i,j,k=0;
     for(i=1;i<=n;i++) rank[sa[i]]=i;
     for(i=0;i<n;height[rank[i++]]=k)
     for(k?k--:0,j=sa[rank[i]-1];r[i+k]==r[j+k];k++);
     return;
}
int RMQ[maxn];
int mm[maxn];
int best[20][maxn];
void initRMQ(int n)
{
     int i,j,a,b;
     for(i = 1; i <= n; ++i) RMQ[i] = height[i];
     for(mm[0]=-1,i=1;i<=n;i++)
     mm[i]=((i&(i-1))==0)?mm[i-1]+1:mm[i-1];
     for(i=1;i<=n;i++) best[0][i]=i;
     for(i=1;i<=mm[n];i++)
     for(j=1;j<=n+1-(1<<i);j++)
     {
       a=best[i-1][j];
       b=best[i-1][j+(1<<(i-1))];
       if(RMQ[a]<RMQ[b]) best[i][j]=a;
       else best[i][j]=b;
     }
     return;
}
int askRMQ(int a,int b)
{
    int t;
    t=mm[b-a+1];b-=(1<<t)-1;
    a=best[t][a];b=best[t][b];
    return RMQ[a]<RMQ[b]?a:b;
}
int lcp(int a,int b)
{
    int t;
    a=rank[a];b=rank[b];
    if(a>b) {t=a;a=b;b=t;}
    return(height[askRMQ(a+1,b)]);
}

 

 

   

POJ 1743

给点一个字符串,求最长重复子串,这两个子串不能重叠。

二分答案,判断是否存在长度为k的子串是相同的,且不重叠。在判定过程中用到一个很犀利的方法:给height数组分组,使得每组中height值都不小于k。这样判断是否存在就是判断某组中有没有两个sa[i], sa[j]的差值 >= k(也就是长度 >= k);

View Code

//#pragma comment(linker,"/STACK:327680000,327680000")
#include <iostream>
#include <cstdio>
#include <cmath>
#include <vector>
#include <cstring>
#include <algorithm>
#include <string>
#include <set>
#include <functional>
#include <numeric>
#include <sstream>
//#include <stack>
#include <map>
#include <queue>

#define CL(arr, val)    memset(arr, val, sizeof(arr))
#define REP(i, n)       for((i) = 0; (i) < (n); ++(i))
#define FOR(i, l, h)    for((i) = (l); (i) <= (h); ++(i))
#define FORD(i, h, l)   for((i) = (h); (i) >= (l); --(i))
#define L(x)    (x) << 1
#define R(x)    (x) << 1 | 1
#define MID(l, r)   (l + r) >> 1
#define Min(x, y)   (x) < (y) ? (x) : (y)
#define Max(x, y)   (x) < (y) ? (y) : (x)
#define E(x)        (1 << (x))
#define iabs(x)     (x) < 0 ? -(x) : (x)
#define OUT(x)  printf("%I64d\n", x)
#define Read()  freopen("data.in", "r", stdin)
#define Write() freopen("data.out", "w", stdout);

typedef long long LL;
const double eps = 1e-8;
const double PI = acos(-1.0);
const int inf = ~0u>>2;


using namespace std;

const int maxn = 200010;

int wa[maxn], wb[maxn], wv[maxn], WS[maxn];
int r[maxn], sa[maxn], n, k;
int rank[maxn], height[maxn];

int cmp(int *r, int a, int b, int l) {
    return r[a] == r[b]&&r[a+l] == r[b+l];
}

void da(int* r, int* sa, int n, int m) {
    int i, j, p, *x = wa, *y = wb, *t;
    for(i = 0; i < m; ++i)  WS[i] = 0;
    for(i = 0; i < n; ++i)  WS[x[i]=r[i]]++;
    for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
    for(i = n - 1; i >= 0; --i) sa[--WS[x[i]]] = i;
    for(j = 1, p = 1; p < n; j *= 2, m = p) {
        for(p = 0, i = n - j; i < n; ++i)   y[p++] = i;
        for(i = 0; i < n; ++i)  if(sa[i] >= j)  y[p++] = sa[i] - j;
        for(i = 0; i < n; ++i)  wv[i] = x[y[i]];
        for(i = 0; i < m; ++i)  WS[i] = 0;
        for(i = 0; i < n; ++i)  WS[wv[i]]++;
        for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
        for(i = n - 1; i >= 0; --i) sa[--WS[wv[i]]] = y[i];
        for(t = x, x = y, y = t, p = 1, x[sa[0]] = 0, i = 1; i < n; ++i)
            x[sa[i]] = cmp(y, sa[i-1], sa[i], j)?p-1:p++;
    }
    return ;
}

void calheight(int* r, int* sa, int n) {
    int i, j, k = 0;
    for(i = 1; i <= n; ++i) rank[sa[i]] = i;
    for(i = 0; i < n; height[rank[i++]] = k)
    for(k?k--:0, j = sa[rank[i]-1]; r[i+k] == r[j+k]; ++k);
    return ;
}

bool cal(int k) {
    int i, mx, mi;
    for(i = 2; i <= n; ++i) {
        if(height[i] < k) {
            mx = -1; mi = inf;
        } else {
            mx = max(mx, max(sa[i], sa[i-1]));
            mi = min(mi, min(sa[i], sa[i-1]));
            if(mx - mi > k) return true;
        }
    }
    return false;
}

int main() {
    //Read();

    int i, x, p;
    while(scanf("%d", &n), n) {
        scanf("%d", &r[0]);
        p = r[0];
        for(i = 1; i < n; ++i) {
            scanf("%d", &x);
            r[i] = x - p + 100;
            p = x;
        }
        da(r, sa, n + 1, 200);
        calheight(r, sa, n);
        int l = 0, r = n>>1, mid;
        while(r - l > 1) {
            mid = MID(l, r);
            if(cal(mid))    l = mid;
            else    r = mid;
        }
        if(l < 4)   puts("0");
        else    printf("%d\n", l + 1);
    }
    return 0;
}

//39 34 30 26 22
//82 78 74 70 66

 

POJ 3261

给一个字符串,求至少出现k次的最长重复子串,这k个子串可以重叠。

还是二分答案,给height分组,不过这里只需要判断一组中是否有>=k个元素。

View Code

//#pragma comment(linker,"/STACK:327680000,327680000")
#include <iostream>
#include <cstdio>
#include <cmath>
#include <vector>
#include <cstring>
#include <algorithm>
#include <string>
#include <set>
#include <functional>
#include <numeric>
#include <sstream>
//#include <stack>
#include <map>
#include <queue>

#define CL(arr, val)    memset(arr, val, sizeof(arr))
#define REP(i, n)       for((i) = 0; (i) < (n); ++(i))
#define FOR(i, l, h)    for((i) = (l); (i) <= (h); ++(i))
#define FORD(i, h, l)   for((i) = (h); (i) >= (l); --(i))
#define L(x)    (x) << 1
#define R(x)    (x) << 1 | 1
#define MID(l, r)   (l + r) >> 1
#define Min(x, y)   (x) < (y) ? (x) : (y)
#define Max(x, y)   (x) < (y) ? (y) : (x)
#define E(x)        (1 << (x))
#define iabs(x)     (x) < 0 ? -(x) : (x)
#define OUT(x)  printf("%I64d\n", x)
#define Read()  freopen("data.in", "r", stdin)
#define Write() freopen("data.out", "w", stdout);

typedef long long LL;
const double eps = 1e-8;
const double PI = acos(-1.0);
const int inf = ~0u>>2;


using namespace std;

const int maxn = 200010;

int wa[maxn], wb[maxn], wv[maxn], WS[maxn];
int r[maxn], sa[maxn], n, K;
int rank[maxn], height[maxn];

int cmp(int *r, int a, int b, int l) {
    return r[a] == r[b]&&r[a+l] == r[b+l];
}

void da(int* r, int* sa, int n, int m) {
    int i, j, p, *x = wa, *y = wb, *t;
    for(i = 0; i < m; ++i)  WS[i] = 0;
    for(i = 0; i < n; ++i)  WS[x[i]=r[i]]++;
    for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
    for(i = n - 1; i >= 0; --i) sa[--WS[x[i]]] = i;
    for(j = 1, p = 1; p < n; j *= 2, m = p) {
        for(p = 0, i = n - j; i < n; ++i)   y[p++] = i;
        for(i = 0; i < n; ++i)  if(sa[i] >= j)  y[p++] = sa[i] - j;
        for(i = 0; i < n; ++i)  wv[i] = x[y[i]];
        for(i = 0; i < m; ++i)  WS[i] = 0;
        for(i = 0; i < n; ++i)  WS[wv[i]]++;
        for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
        for(i = n - 1; i >= 0; --i) sa[--WS[wv[i]]] = y[i];
        for(t = x, x = y, y = t, p = 1, x[sa[0]] = 0, i = 1; i < n; ++i)
            x[sa[i]] = cmp(y, sa[i-1], sa[i], j)?p-1:p++;
    }
    return ;
}

void calheight(int* r, int* sa, int n) {
    int i, j, k = 0;
    for(i = 1; i <= n; ++i) rank[sa[i]] = i;
    for(i = 0; i < n; height[rank[i++]] = k)
    for(k?k--:0, j = sa[rank[i]-1]; r[i+k] == r[j+k]; ++k);
    return ;
}

bool cal(int mid) {
    int i, cnt = 1;
    for(i = 2; i <= n; ++i) {
        if(height[i] < mid) {
            cnt = 1;
        } else {
            cnt++;
            if(cnt >= K)    return true;
        }
    }
    return false;
}

int main() {
    //Read();

    int i ;
    while(~scanf("%d%d", &n, &K)) {
        for(i = 0; i < n; ++i)  scanf("%d", r + i);
        da(r, sa, n + 1, 200);
        calheight(r, sa, n);
        int l = 0, r = n, mid;
        while(r - l > 1) {
            mid = MID(l, r);
            if(cal(mid))    l = mid;
            else    r = mid;
        }
        printf("%d\n", l);
    }
    return 0;
}

 

URAL 1297

给一个字符串,求最长回文串。

现将字符串反转加到原串后面,中间用一个从来没有出现过的字符隔开,求后缀数组;然后枚举每一位i,求以i为中心的回文串的长度。(len为原串的长度,n为加上反转以后字符串的长度)也就是求i(0 <= i < len)位置和n - i - 1位置的最大公共前缀长度(枚举的这个串长度为计数),或者i位置和n - i位置的最大公共前缀长度。

View Code

//#pragma comment(linker,"/STACK:327680000,327680000")
#include <iostream>
#include <cstdio>
#include <cmath>
#include <vector>
#include <cstring>
#include <algorithm>
#include <string>
#include <set>
#include <functional>
#include <numeric>
#include <sstream>
//#include <stack>
#include <map>
#include <queue>

#define CL(arr, val)    memset(arr, val, sizeof(arr))
#define REP(i, n)       for((i) = 0; (i) < (n); ++(i))
#define FOR(i, l, h)    for((i) = (l); (i) <= (h); ++(i))
#define FORD(i, h, l)   for((i) = (h); (i) >= (l); --(i))
#define L(x)    (x) << 1
#define R(x)    (x) << 1 | 1
#define MID(l, r)   (l + r) >> 1
#define Min(x, y)   (x) < (y) ? (x) : (y)
#define Max(x, y)   (x) < (y) ? (y) : (x)
#define E(x)        (1 << (x))
#define iabs(x)     (x) < 0 ? -(x) : (x)
#define OUT(x)  printf("%I64d\n", x)
#define Read()  freopen("data.in", "r", stdin)
#define Write() freopen("data.out", "w", stdout);

typedef long long LL;
const double eps = 1e-8;
const double PI = acos(-1.0);
const int inf = ~0u>>2;


using namespace std;

const int maxn = 2012;

int wa[maxn], wb[maxn], wv[maxn], WS[maxn];
int r[maxn], sa[maxn], n, K;
int Rank[maxn], height[maxn];

int cmp(int *r, int a, int b, int l) {
    return r[a] == r[b]&&r[a+l] == r[b+l];
}

void da(int* r, int* sa, int n, int m) {
    int i, j, p, *x = wa, *y = wb, *t;
    for(i = 0; i < m; ++i)  WS[i] = 0;
    for(i = 0; i < n; ++i)  WS[x[i]=r[i]]++;
    for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
    for(i = n - 1; i >= 0; --i) sa[--WS[x[i]]] = i;
    for(j = 1, p = 1; p < n; j *= 2, m = p) {
        for(p = 0, i = n - j; i < n; ++i)   y[p++] = i;
        for(i = 0; i < n; ++i)  if(sa[i] >= j)  y[p++] = sa[i] - j;
        for(i = 0; i < n; ++i)  wv[i] = x[y[i]];
        for(i = 0; i < m; ++i)  WS[i] = 0;
        for(i = 0; i < n; ++i)  WS[wv[i]]++;
        for(i = 1; i < m; ++i)  WS[i] += WS[i-1];
        for(i = n - 1; i >= 0; --i) sa[--WS[wv[i]]] = y[i];
        for(t = x, x = y, y = t, p = 1, x[sa[0]] = 0, i = 1; i < n; ++i)
            x[sa[i]] = cmp(y, sa[i-1], sa[i], j)?p-1:p++;
    }
    return ;
}

void calheight(int* r, int* sa, int n) {
    int i, j, k = 0;
    for(i = 1; i <= n; ++i) Rank[sa[i]] = i;
    for(i = 0; i < n; height[Rank[i++]] = k)
    for(k?k--:0, j = sa[Rank[i]-1]; r[i+k] == r[j+k]; ++k);
    return ;
}

int RMQ[maxn];
int mm[maxn];
int best[20][maxn];

void initRMQ(int n) {
    int i, j, a, b;
    for(i = 1; i <= n; ++i) RMQ[i] = height[i];
    for(mm[0] = -1, i = 1; i <= n; ++i)
        mm[i] = ((i&(i-1)) == 0) ? mm[i-1] + 1 : mm[i-1];
    for(i = 1; i <= n; ++i) best[0][i] = i;
    for(i = 1; i <= mm[n]; ++i) {
        for(j = 1; j <= n + 1 - (1<<i); ++j) {
            a = best[i-1][j];
            b = best[i-1][j+(1<<(i-1))];
            if(RMQ[a] < RMQ[b]) best[i][j] = a;
            else    best[i][j] = b;
        }
    }
    return ;
}

int askRMQ(int a, int b) {
    int t;
    t = mm[b-a+1]; b -= (1<<t) - 1;
    a = best[t][a]; b = best[t][b];
    return RMQ[a] < RMQ[b] ? a : b;
}

int lcp(int a, int b) {
    a = Rank[a]; b = Rank[b];
    if(a > b)   swap(a, b);
    return height[askRMQ(a + 1, b)];
}

char ss[maxn];

int main() {
    //Read();
    while(~scanf("%s", ss)) {
        int len, i;
        len = strlen(ss);
        n = 0;
        for(i = 0; i < len; ++i)    r[n++] = ss[i];
        r[n++] = '$';
        for(i = len - 1; i >= 0; --i)    r[n++] = ss[i];
        r[n] = 0;
        /*
        for(i = 0; i < n; ++i)  printf("%c", r[i]);
        cout << endl;
        printf("%d\n", n);
        */
        da(r, sa, n + 1, 129);
        calheight(r, sa, n);
        initRMQ(n);
        int tmp, p = 0, ans = 0;
        for(i = 0; i < len; ++i) {
            tmp = lcp(i, n - i - 1);
            if((tmp<<1) - 1 > ans) {
                ans = (tmp<<1) - 1;
                p = i - tmp + 1;
            }
            tmp = lcp(i, n - i);
            if((tmp<<1) > ans) {
                ans = (tmp<<1);
                p = i - tmp;
            }
        }
        for(i = p; i < ans + p; ++i) {
            printf("%c", r[i]);
        }
        puts("");
    }
    return 0;
}

 

POJ  2406

给一个字符串L,一直这个字符串是由某个字符串S重复R次得到的,求最大的R。

枚举S的长度k,判断Suffix(0)和Suffix(k)的最长公共前缀是否为n - k。(n%k == 0);

找Suffix(k) 和 Suffix(0)的最长公共前缀时可以预处理出每个位置到Rank[0]位置的最小height值。

ps:这题只能dc3过,倍增TLE,T_T

详见代码:

View Code

//#pragma comment(linker,"/STACK:32768

抱歉!评论已关闭.