DNA Sequence

题目描述

It’s well known that DNA Sequence is a sequence only contains A, C, T and G, and it’s very useful to analyze a segment of DNA Sequence，For example, if a animal’s DNA sequence contains segment ATC then it may mean that the animal may have a genetic disease. Until now scientists have found several those segments, the problem is how many kinds of DNA sequences of a species don’t contain those segments.

Suppose that DNA sequences of a species is a sequence that consist of A, C, T and G, and the length of sequences is a given integer $n$.

题意概述

给定$m$个长度不超过$10$的 DNA 片段，问所有长度为$n$的 DNA 序列中有几个不包含所有给定的 DNA 片段。

数据范围：$0 \le m \le 10, \ 1 \le n \le 2 \times 10^9$。

算法分析

对$m$个 DNA 片段建立 AC 自动机。令$f_{i, j}$表示长度为$i$的 DNA 序列在自动机上跑到节点$j$的方案数。转移可以用一个矩阵来表示，因此可以用矩阵快速幂来加速转移。

代码

#include <cstdio>
#include <cstring>

static const int N = 11;
static const int POOL = 105;
char s[N];

int get_id(char c) {
  return c == 'A' ? 0 : c == 'C' ? 1 : c == 'T' ? 2 : 3;
}

typedef int Array[POOL][POOL];

int numn = 1, que[POOL];
Array mp, a, tmp;
struct ACAutomaton {
  int nxt[4], fail;
  bool match;
} node[POOL];

void insert(char s[]) {
  int root = 1, len = strlen(s);
  for (int i = 0; i < len; ++ i) {
    int p = get_id(s[i]);
    if (! node[root].nxt[p]) node[root].nxt[p] = ++ numn;
    root = node[root].nxt[p];
  }
  node[root].match = true;
}

void build() {
  int qb = 0, qe = 1; que[0] = 1;
  while (qb < qe) {
    int u = que[qb ++];
    for (int i = 0; i < 4; ++ i)
      if (node[u].nxt[i]) {
        node[node[u].nxt[i]].fail = 1;
        if (u > 1) {
          int root = node[u].fail;
          while (root > 1 && ! node[root].nxt[i]) root = node[root].fail;
          if (node[root].nxt[i]) {
            node[node[u].nxt[i]].fail = node[root].nxt[i];
            if (node[node[root].nxt[i]].match) node[node[u].nxt[i]].match = true;
          }
        }
        que[qe ++] = node[u].nxt[i];
      }
  }
}

void mul(Array a, Array b) {
  memset(tmp, 0, sizeof tmp);
  for (int i = 1; i <= numn; ++ i)
    for (int j = 1; j <= numn; ++ j)
      if (a[i][j]) for (int k = 1; k <= numn; ++ k) (tmp[i][k] += 1ll * a[i][j] * b[j][k] % 100000) %= 100000;
  memcpy(a, tmp, sizeof tmp);
}

void power(Array a, int b) {
  while (b) {
    if (b & 1) mul(a, mp);
    mul(mp, mp), b >>= 1;
  }
}

int main() {
  int m, n; scanf("%d%d", &m, &n);
  for (int i = 0; i < m; ++ i) scanf(" %s", s), insert(s);
  build();
  for (int i = 1; i <= numn; ++ i) if (! node[i].match)
    for (int j = 0; j < 4; ++ j) {
      int root = i;
      while (root > 1 && ! node[root].nxt[j]) root = node[root].fail;
      if (node[root].nxt[j]) ++ mp[i][node[root].nxt[j]]; else ++ mp[i][root];
    }
  for (int i = 1; i <= numn; ++ i) a[i][i] = 1;
  power(a, n);
  int ans = 0;
  for (int i = 1; i <= numn; ++ i) if (! node[i].match) (ans += a[1][i]) %= 100000;
  printf("%d\n", ans);
  return 0;
}

OI > Common Skill

#Aho-Corasick Automaton #Exponentiation by Squaring

DNA Sequence

https://regmsif.cf/2018/01/15/oi/dna-sequence/

作者

RegMs If

发布于

2018年1月15日

许可协议

Repairing Company 上一篇

Picnic Planning 下一篇