1 module miao..string.dfa;
2 
3 /*!
4 Main features
5     builds the minimal deterministic automaton recognizing the language Sigma*x;
6     extra space in O(msigma) if the automaton is stored in a direct access table;
7     preprocessing phase in O(msigma) time complexity;
8     searching phase in O(n) time complexity if the automaton is stored in a direct access table, O(nlog(sigma)) otherwise.
9     
10 Description
11     Searching a word x with an automaton consists first in building the minimal Deterministic Finite Automaton (DFA)  A(x) recognizing the language Sigma*x.
12     The DFA  A(x) =(Q, q0, T, E) recognizing the language Sigma*x is defined as follows:
13         Q is the set of all the prefixes of x: Q={epsilon, x[0], x[0 .. 1], ... , x[0 .. m-2], x};
14         q0=epsilon;
15         T={x};
16         for q in Q (q is a prefix of x) and a in Sigma, (q, a, qa) is in E if and only if qa is also a prefix of x, otherwise (q, a, p) is in E such that p is the longest suffix of qa which is a prefix of x.
17     The DFA  A(x) can be constructed in O(m+sigma) time and O(msigma) space.
18     Once the DFA  A(x) is build, searching for a word x in a text y consists in parsing the text y with the DFA  A(x) beginning with the initial state q0. Each time the terminal state is encountered an occurrence of x is reported.
19     The searching phase can be performed in O(n) time if the automaton is stored in a direct access table, in O(nlog(sigma)) otherwise.
20 */
21 
22 @trusted:
23 
24 import miao.common.check;
25 import miao.common.util;
26 
27 //! \fix    generalized to all kinds of charset
28 struct Dfa_searcher(PatRange, CorpusRange = PatRange) 
29     if (isValidParam!(PatRange, CorpusRange)) {
30 public:
31     this(in PatRange pattern)
32     {
33         pat_len_ = pattern.length;
34         if (pat_len_ > 0) {
35             terminal_ = pat_len_;
36             dfa_ = build_dfa_(pattern);
37         }
38     }
39     
40     int search(in CorpusRange corpus) const
41 	out(result) {
42 		assert(result == -1 || (0 <= result && result < corpus.length));
43 	}
44 	body {
45 		if (corpus.length == 0 || pat_len_ == 0) return -1;
46 		if (corpus.length < pat_len_) return -1;
47 		
48 		return search_(corpus);
49 	}
50     
51 private:
52     enum ascii_size = cast(uint)0xff;
53     enum init_state = 0;
54     
55     int[][] build_dfa_(in PatRange pattern) const
56     in {
57         assert(pat_len_ > 0);
58     }
59     body {
60         auto dfa = new int[][pat_len_ + 1];
61         
62         auto state = init_state;
63         
64         for (auto i = 0; i < pat_len_; ++i) {
65             const target = i + 1;
66             const event = pattern[i];
67 
68             if (dfa[state] == null) {
69                 dfa[state] = new int[ascii_size];
70             }
71 
72             assert(dfa[target] == null);
73 
74             auto old_target = dfa[state][event];
75             
76             dfa[state][event] = target;
77             dfa[target] = new int[ascii_size];
78             dfa[target][] = dfa[old_target][];
79             
80             state = target;
81         }
82         
83         assert(state == terminal_);
84 
85         return dfa;
86     }
87   
88     int search_(inout const CorpusRange corpus) const
89     {
90         for (auto state = init_state, cursor = 0; cursor < corpus.length; ++cursor) {
91             const event = corpus[cursor];
92     
93             assert(event < ascii_size, "@Dfa_searcher: only ascii are supported for the moment");
94             
95             if ((state = transit_(state, event)) == terminal_) {
96                 assert(cursor + 1 >= pat_len_);
97                 return cursor - pat_len_ + 1;
98             }
99         }
100         
101         return -1;
102     }
103     
104     int transit_(int state, int event) const
105     in {
106         assert(0 <= state && state < terminal_);
107         assert(0 <= event && event < ascii_size);
108     }
109     body {
110         const table = dfa_[state];
111         return table == null? 0: table[event];
112     }
113     
114 private:
115     immutable int[][] dfa_;
116     immutable int terminal_;
117     immutable int pat_len_;
118 }
119 
120 alias dfa_search = GenerateFunction!Dfa_searcher;
121 
122 unittest {
123 	import miao.string.test;
124 
125     testAll!(Dfa_searcher, dfa_search)();
126 }