| 1 | -- Alfred-Margaret: Fast Aho-Corasick string searching |
| 2 | -- Copyright 2022 Channable |
| 3 | -- |
| 4 | -- Licensed under the 3-clause BSD license, see the LICENSE file in the |
| 5 | -- repository root. |
| 6 | |
| 7 | {-# LANGUAGE BangPatterns #-} |
| 8 | {-# LANGUAGE CPP #-} |
| 9 | {-# LANGUAGE DeriveGeneric #-} |
| 10 | {-# LANGUAGE FlexibleInstances #-} |
| 11 | {-# LANGUAGE OverloadedStrings #-} |
| 12 | {-# LANGUAGE TupleSections #-} |
| 13 | |
| 14 | module Data.Text.AhoCorasick.Searcher |
| 15 | ( Searcher |
| 16 | , automaton |
| 17 | , build |
| 18 | , buildNeedleIdSearcher |
| 19 | , buildWithValues |
| 20 | , caseSensitivity |
| 21 | , containsAll |
| 22 | , containsAny |
| 23 | , mapSearcher |
| 24 | , needles |
| 25 | , numNeedles |
| 26 | , setCaseSensitivity |
| 27 | ) where |
| 28 | |
| 29 | import Control.DeepSeq (NFData) |
| 30 | import Data.Bifunctor (second) |
| 31 | import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed) |
| 32 | import GHC.Generics (Generic) |
| 33 | |
| 34 | #if defined(HAS_AESON) |
| 35 | import Data.Aeson ((.:), (.=)) |
| 36 | import qualified Data.Aeson as AE |
| 37 | #endif |
| 38 | |
| 39 | import qualified Data.IntSet as IS |
| 40 | |
| 41 | import Data.Text.CaseSensitivity (CaseSensitivity (..)) |
| 42 | import Data.Text.Utf8 (Text) |
| 43 | |
| 44 | import qualified Data.Text.AhoCorasick.Automaton as Aho |
| 45 | |
| 46 | -- | A set of needles with associated values, and an Aho-Corasick automaton to |
| 47 | -- efficiently find those needles. |
| 48 | -- |
| 49 | -- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles |
| 50 | -- To enforce this invariant, the fields are not exposed from this module. |
| 51 | -- There is a separate constructor function. |
| 52 | -- |
| 53 | -- The purpose of this wrapper is to have a type that is Hashable and Eq, so we |
| 54 | -- can derive those for types that embed the searcher, whithout requiring the |
| 55 | -- automaton itself to be Hashable or Eq, which would be both wasteful and |
| 56 | -- tedious. Because the automaton is fully determined by the needles and |
| 57 | -- associated values, it is sufficient to implement Eq and Hashable in terms of |
| 58 | -- the needles only. |
| 59 | -- |
| 60 | -- We also use Hashed to cache the hash of the needles. |
| 61 | data Searcher v = Searcher |
| 62 | { searcherCaseSensitive :: CaseSensitivity |
| 63 | , searcherNeedles :: Hashed [(Text, v)] |
| 64 | , searcherNumNeedles :: Int |
| 65 | , searcherAutomaton :: Aho.AcMachine v |
| 66 | } deriving (Generic) |
| 67 | |
| 68 | #if defined(HAS_AESON) |
| 69 | instance AE.ToJSON v => AE.ToJSON (Searcher v) where |
| 70 | toJSON s = AE.object |
| 71 | [ "needles" .= needles s |
| 72 | , "caseSensitivity" .= caseSensitivity s |
| 73 | ] |
| 74 | |
| 75 | instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where |
| 76 | parseJSON = AE.withObject "Searcher" $ \o -> buildWithValues <$> o .: "caseSensitivity" <*> o .: "needles" |
| 77 | #endif |
| 78 | |
| 79 | instance Show (Searcher v) where |
| 80 | show _ = "Searcher _ _ _" |
| 81 | |
| 82 | instance Hashable v => Hashable (Searcher v) where |
| 83 | hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher |
| 84 | {-# INLINE hashWithSalt #-} |
| 85 | |
| 86 | instance Eq v => Eq (Searcher v) where |
| 87 | -- Since we store the length of the needle list anyway, |
| 88 | -- we can use it to early out if there is a length mismatch. |
| 89 | Searcher cx xs nx _ == Searcher cy ys ny _ = (nx, xs, cx) == (ny, ys, cy) |
| 90 | {-# INLINE (==) #-} |
| 91 | |
| 92 | instance NFData v => NFData (Searcher v) |
| 93 | |
| 94 | -- NOTE: Although we could implement Semigroup for every v by just concatenating |
| 95 | -- needle lists, we don't, because this might lead to unexpected results. For |
| 96 | -- example, if v is (Int, a) where the Int is a priority, combining two |
| 97 | -- searchers might want to discard priorities, concatenate the needle lists, and |
| 98 | -- reassign priorities, rather than concatenating the needle lists as-is and |
| 99 | -- possibly having duplicate priorities in the resulting searcher. |
| 100 | instance Semigroup (Searcher ()) where |
| 101 | x <> y |
| 102 | | caseSensitivity x == caseSensitivity y |
| 103 | = buildWithValues (searcherCaseSensitive x) (needles x <> needles y) |
| 104 | | otherwise = error "Combining searchers of different case sensitivity" |
| 105 | {-# INLINE (<>) #-} |
| 106 | |
| 107 | -- | Builds the Searcher for a list of needles |
| 108 | -- The caller is responsible that the needles are lower case in case the IgnoreCase |
| 109 | -- is used for case sensitivity |
| 110 | build :: CaseSensitivity -> [Text] -> Searcher () |
| 111 | build case_ = buildWithValues case_ . fmap (, ()) |
| 112 | |
| 113 | -- | The caller is responsible that the needles are lower case in case the IgnoreCase |
| 114 | -- is used for case sensitivity |
| 115 | buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v |
| 116 | {-# INLINABLE buildWithValues #-} |
| 117 | buildWithValues case_ ns = |
| 118 | Searcher case_ (hashed ns) (length ns) $ Aho.build ns |
| 119 | |
| 120 | -- | Modify the values associated with the needles. |
| 121 | mapSearcher :: Hashable b => (a -> b) -> Searcher a -> Searcher b |
| 122 | mapSearcher f searcher = searcher |
| 123 | { searcherNeedles = hashed $ fmap (second f) $ needles searcher |
| 124 | , searcherAutomaton = fmap f (searcherAutomaton searcher) |
| 125 | } |
| 126 | |
| 127 | needles :: Searcher v -> [(Text, v)] |
| 128 | needles = unhashed . searcherNeedles |
| 129 | |
| 130 | numNeedles :: Searcher v -> Int |
| 131 | numNeedles = searcherNumNeedles |
| 132 | |
| 133 | automaton :: Searcher v -> Aho.AcMachine v |
| 134 | automaton = searcherAutomaton |
| 135 | |
| 136 | caseSensitivity :: Searcher v -> CaseSensitivity |
| 137 | caseSensitivity = searcherCaseSensitive |
| 138 | |
| 139 | -- | Updates the case sensitivity of the searcher. Does not change the |
| 140 | -- capitilization of the needles. The caller should be certain that if IgnoreCase |
| 141 | -- is passed, the needles are already lower case. |
| 142 | setCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v |
| 143 | setCaseSensitivity case_ searcher = searcher{ |
| 144 | searcherCaseSensitive = case_ |
| 145 | } |
| 146 | |
| 147 | -- | Return whether the haystack contains any of the needles. |
| 148 | -- Case sensitivity depends on the properties of the searcher |
| 149 | -- This function is marked noinline as an inlining boundary. Aho.runText is |
| 150 | -- marked inline, so this function will be optimized to report only whether |
| 151 | -- there is a match, and not construct a list of matches. We don't want this |
| 152 | -- function be inline, to make sure that the conditions of the caller don't |
| 153 | -- affect how this function is optimized. There is little to gain from |
| 154 | -- additional inlining. The pragma is not an optimization in itself, rather it |
| 155 | -- is a defence against fragile optimizer decisions. |
| 156 | {-# NOINLINE containsAny #-} |
| 157 | containsAny :: Searcher () -> Text -> Bool |
| 158 | containsAny !searcher !text = |
| 159 | let |
| 160 | -- On the first match, return True immediately. |
| 161 | f _acc _match = Aho.Done True |
| 162 | in case caseSensitivity searcher of |
| 163 | CaseSensitive -> Aho.runText False f (automaton searcher) text |
| 164 | IgnoreCase -> Aho.runLower False f (automaton searcher) text |
| 165 | |
| 166 | -- | Build a 'Searcher' that returns the needle's index in the needle list when it matches. |
| 167 | buildNeedleIdSearcher :: CaseSensitivity -> [Text] -> Searcher Int |
| 168 | buildNeedleIdSearcher !case_ !ns = |
| 169 | buildWithValues case_ $ zip ns [0..] |
| 170 | |
| 171 | -- | Returns whether the haystack contains all of the needles. |
| 172 | -- This function expects the passed 'Searcher' to be constructed using 'buildNeedleIdAutomaton'. |
| 173 | containsAll :: Searcher Int -> Text -> Bool |
| 174 | containsAll !searcher !haystack = |
| 175 | let |
| 176 | initial = IS.fromDistinctAscList [0..numNeedles searcher - 1] |
| 177 | ac = automaton searcher |
| 178 | |
| 179 | f !acc (Aho.Match _index !needleId) |
| 180 | | IS.null acc' = Aho.Done acc' |
| 181 | | otherwise = Aho.Step acc' |
| 182 | where |
| 183 | !acc' = IS.delete needleId acc |
| 184 | |
| 185 | in IS.null $ case caseSensitivity searcher of |
| 186 | CaseSensitive -> Aho.runText initial f ac haystack |
| 187 | IgnoreCase -> Aho.runLower initial f ac haystack |