1-- Alfred-Margaret: Fast Aho-Corasick string searching
2-- Copyright 2022 Channable
3--
4-- Licensed under the 3-clause BSD license, see the LICENSE file in the
5-- repository root.
6
7{-# LANGUAGE BangPatterns #-}
8{-# LANGUAGE CPP #-}
9{-# LANGUAGE DeriveGeneric #-}
10{-# LANGUAGE FlexibleInstances #-}
11{-# LANGUAGE OverloadedStrings #-}
12{-# LANGUAGE TupleSections #-}
13
14module Data.Text.AhoCorasick.Searcher
15 ( Searcher
16 , automaton
17 , build
18 , buildNeedleIdSearcher
19 , buildWithValues
20 , caseSensitivity
21 , containsAll
22 , containsAny
23 , mapSearcher
24 , needles
25 , numNeedles
26 , setCaseSensitivity
27 ) where
28
29import Control.DeepSeq (NFData)
30import Data.Bifunctor (second)
31import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed)
32import GHC.Generics (Generic)
33
34#if defined(HAS_AESON)
35import Data.Aeson ((.:), (.=))
36import qualified Data.Aeson as AE
37#endif
38
39import qualified Data.IntSet as IS
40
41import Data.Text.CaseSensitivity (CaseSensitivity (..))
42import Data.Text.Utf8 (Text)
43
44import qualified Data.Text.AhoCorasick.Automaton as Aho
45
46-- | A set of needles with associated values, and an Aho-Corasick automaton to
47-- efficiently find those needles.
48--
49-- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles
50-- To enforce this invariant, the fields are not exposed from this module.
51-- There is a separate constructor function.
52--
53-- The purpose of this wrapper is to have a type that is Hashable and Eq, so we
54-- can derive those for types that embed the searcher, whithout requiring the
55-- automaton itself to be Hashable or Eq, which would be both wasteful and
56-- tedious. Because the automaton is fully determined by the needles and
57-- associated values, it is sufficient to implement Eq and Hashable in terms of
58-- the needles only.
59--
60-- We also use Hashed to cache the hash of the needles.
61data Searcher v = Searcher
62 { searcherCaseSensitive :: CaseSensitivity
63 , searcherNeedles :: Hashed [(Text, v)]
64 , searcherNumNeedles :: Int
65 , searcherAutomaton :: Aho.AcMachine v
66 } deriving (Generic)
67
68#if defined(HAS_AESON)
69instance AE.ToJSON v => AE.ToJSON (Searcher v) where
70 toJSON s = AE.object
71 [ "needles" .= needles s
72 , "caseSensitivity" .= caseSensitivity s
73 ]
74
75instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where
76 parseJSON = AE.withObject "Searcher" $ \o -> buildWithValues <$> o .: "caseSensitivity" <*> o .: "needles"
77#endif
78
79instance Show (Searcher v) where
80 show _ = "Searcher _ _ _"
81
82instance Hashable v => Hashable (Searcher v) where
83 hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher
84 {-# INLINE hashWithSalt #-}
85
86instance Eq v => Eq (Searcher v) where
87 -- Since we store the length of the needle list anyway,
88 -- we can use it to early out if there is a length mismatch.
89 Searcher cx xs nx _ == Searcher cy ys ny _ = (nx, xs, cx) == (ny, ys, cy)
90 {-# INLINE (==) #-}
91
92instance NFData v => NFData (Searcher v)
93
94-- NOTE: Although we could implement Semigroup for every v by just concatenating
95-- needle lists, we don't, because this might lead to unexpected results. For
96-- example, if v is (Int, a) where the Int is a priority, combining two
97-- searchers might want to discard priorities, concatenate the needle lists, and
98-- reassign priorities, rather than concatenating the needle lists as-is and
99-- possibly having duplicate priorities in the resulting searcher.
100instance Semigroup (Searcher ()) where
101 x <> y
102 | caseSensitivity x == caseSensitivity y
103 = buildWithValues (searcherCaseSensitive x) (needles x <> needles y)
104 | otherwise = error "Combining searchers of different case sensitivity"
105 {-# INLINE (<>) #-}
106
107-- | Builds the Searcher for a list of needles
108-- The caller is responsible that the needles are lower case in case the IgnoreCase
109-- is used for case sensitivity
110build :: CaseSensitivity -> [Text] -> Searcher ()
111build case_ = buildWithValues case_ . fmap (, ())
112
113-- | The caller is responsible that the needles are lower case in case the IgnoreCase
114-- is used for case sensitivity
115buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v
116{-# INLINABLE buildWithValues #-}
117buildWithValues case_ ns =
118 Searcher case_ (hashed ns) (length ns) $ Aho.build ns
119
120-- | Modify the values associated with the needles.
121mapSearcher :: Hashable b => (a -> b) -> Searcher a -> Searcher b
122mapSearcher f searcher = searcher
123 { searcherNeedles = hashed $ fmap (second f) $ needles searcher
124 , searcherAutomaton = fmap f (searcherAutomaton searcher)
125 }
126
127needles :: Searcher v -> [(Text, v)]
128needles = unhashed . searcherNeedles
129
130numNeedles :: Searcher v -> Int
131numNeedles = searcherNumNeedles
132
133automaton :: Searcher v -> Aho.AcMachine v
134automaton = searcherAutomaton
135
136caseSensitivity :: Searcher v -> CaseSensitivity
137caseSensitivity = searcherCaseSensitive
138
139-- | Updates the case sensitivity of the searcher. Does not change the
140-- capitilization of the needles. The caller should be certain that if IgnoreCase
141-- is passed, the needles are already lower case.
142setCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v
143setCaseSensitivity case_ searcher = searcher{
144 searcherCaseSensitive = case_
145 }
146
147-- | Return whether the haystack contains any of the needles.
148-- Case sensitivity depends on the properties of the searcher
149-- This function is marked noinline as an inlining boundary. Aho.runText is
150-- marked inline, so this function will be optimized to report only whether
151-- there is a match, and not construct a list of matches. We don't want this
152-- function be inline, to make sure that the conditions of the caller don't
153-- affect how this function is optimized. There is little to gain from
154-- additional inlining. The pragma is not an optimization in itself, rather it
155-- is a defence against fragile optimizer decisions.
156{-# NOINLINE containsAny #-}
157containsAny :: Searcher () -> Text -> Bool
158containsAny !searcher !text =
159 let
160 -- On the first match, return True immediately.
161 f _acc _match = Aho.Done True
162 in case caseSensitivity searcher of
163 CaseSensitive -> Aho.runText False f (automaton searcher) text
164 IgnoreCase -> Aho.runLower False f (automaton searcher) text
165
166-- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.
167buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int
168buildNeedleIdSearcher !case_ !ns =
169 buildWithValues case_ $ zip ns [0..]
170
171-- | Returns whether the haystack contains all of the needles.
172-- This function expects the passed 'Searcher' to be constructed using 'buildNeedleIdAutomaton'.
173containsAll :: Searcher Int -> Text -> Bool
174containsAll !searcher !haystack =
175 let
176 initial = IS.fromDistinctAscList [0..numNeedles searcher - 1]
177 ac = automaton searcher
178
179 f !acc (Aho.Match _index !needleId)
180 | IS.null acc' = Aho.Done acc'
181 | otherwise = Aho.Step acc'
182 where
183 !acc' = IS.delete needleId acc
184
185 in IS.null $ case caseSensitivity searcher of
186 CaseSensitive -> Aho.runText initial f ac haystack
187 IgnoreCase -> Aho.runLower initial f ac haystack