Softwaredesign: Datenmodell

module Index
where
import Prelude hiding (Word)
import Data.Char
import Data.List(foldl', isPrefixOf)
import           Data.Map(Map)
import qualified Data.Map as M
import           Data.Set(Set)
import qualified Data.Set as S
-- ------------------------------------------------------------
--
-- data types
-- the index maps every word to the set of places where it occurs
type Index              = Map Word Occurences
-- all occurrences of a single word
type Occurences         = Set Occurence
-- one occurrence: the document and the position of the word within it
type Occurence          = (DocumentName, Position)
-- a document is a name together with its words in reading order
type Document           = (DocumentName, WordList)
type WordList           = [Word]
-- result of projecting occurrences onto the documents they come from
type DocumentNames      = Set DocumentName
-- alias names for the primitive types used above
-- NOTE(review): Word has the same name as the Word type exported by the
-- Prelude since base 4.8; the Prelude one has to be hidden in the import
-- list, otherwise every use of Word in this module is ambiguous.
type Word               = String
type DocumentName       = String
-- position of a word within the word list of its document
type Position           = Int
-- ------------------------------------------------------------
--
-- new index
-- the index that contains no words at all; starting point for
-- building an index with indexDocument
emptyIndex      :: Index
emptyIndex      = M.empty
-- ------------------------------------------------------------
--
-- basic functions
-- add all words of a document to the index
indexDocument   ::                   Document -> Index -> Index
-- add only those words of a document that satisfy the predicate
indexDocument'  :: (Word -> Bool) -> Document -> Index -> Index
-- remove everything that was indexed for the named document
deleteDocument  ::               DocumentName -> Index -> Index

-- unfiltered indexing: the predicate accepts every word
indexDocument doc ix
    = indexDocument' acceptAll doc ix
    where
    acceptAll _ = True
-- index a document, but filter uninteresting words
--
-- Every word is normalized first and the predicate is applied to the
-- normalized form.  Positions are zero-based and are assigned before
-- filtering, so a skipped word still occupies its position.
--
-- The words are inserted with a strict left fold, so a long document
-- does not pile up suspended insertions.  The insertion order does not
-- influence the result, because indexWord merges the occurrences of a
-- word by set union.
indexDocument' isOfInterest (n, wl) ix
    = foldl' insertOcc ix interesting
    where
    -- normalized words paired with their position in the document
    numbered    = zip (map normalize wl) [0..]
    interesting = filter (isOfInterest . fst) numbered
    insertOcc acc (w, p) = indexWord w n p acc
-- record a single occurrence of word w at position p of document n;
-- occurrences already stored for w are kept and extended
indexWord       :: Word -> DocumentName -> Position -> Index -> Index
indexWord w n p ix
    = M.insertWith S.union w occ ix
    where
    occ = S.singleton (n, p)
-- remove every occurrence that belongs to document n; words that are
-- left without any occurrence are dropped from the index
deleteDocument n ix
    = M.mapMaybe keepOthers ix
    where
    keepOthers occs
        | S.null remaining = Nothing
        | otherwise        = Just remaining
        where
        remaining = S.filter (\ (doc, _) -> doc /= n) occs
-- exact search: all occurrences of the (normalized) word, or the empty
-- set if the word is not in the index
searchIndex     :: Word -> Index -> Occurences
searchIndex w ix
    = case M.lookup (normalize w) ix of
        Just occs -> occs
        Nothing   -> S.empty
-- prefix search: the occurrences of all indexed words that start with
-- the (normalized) prefix w; an empty prefix matches every word
--
-- The matching sets are collected with M.elems and merged with S.unions
-- instead of M.fold, which is no longer usable in current versions of
-- the containers package.
prefixSearchIndex       :: Word -> Index -> Occurences
prefixSearchIndex w
    = S.unions
      .
      M.elems
      .
      M.filterWithKey (\ k _ -> prefix `isPrefixOf` k)
    where
    -- normalize the prefix once, not once per key
    prefix = normalize w
-- projection: forget the positions within a document and keep only the
-- names of the documents; a document with several occurrences appears
-- once in the result
documentNames   :: Occurences -> DocumentNames
documentNames
    = S.map fst
-- ------------------------------------------------------------
--
-- word processing
-- bring a word into the form used as key of the index: the German
-- umlauts and the sharp s are spelled out with two letters, then all
-- letters are converted to lower case
normalize       :: Word -> Word
normalize word
    = map toLower (concatMap spellOut word)
    where
    spellOut c
        = case lookup c replacements of
            Just s  -> s
            Nothing -> [c]
    -- the characters are written as decimal character codes
    replacements
        = [ ('\196', "Ae")      -- capital A umlaut
          , ('\214', "Oe")      -- capital O umlaut
          , ('\220', "Ue")      -- capital U umlaut
          , ('\228', "ae")      -- small a umlaut
          , ('\246', "oe")      -- small o umlaut
          , ('\252', "ue")      -- small u umlaut
          , ('\223', "ss")      -- sharp s
          ]
-- split a text into words: every character that is neither a letter
-- nor a digit acts as a separator
scanText        :: String -> WordList
scanText txt
    = words (map blankOut txt)
    where
    blankOut c
        | isAlphaNum c  = c
        | otherwise     = ' '
-- ------------------------------------------------------------
-- print the index to stdout: one line per word in ascending order of
-- the words, consisting of the shown word, a tab and the ascending list
-- of its occurrences; putStrLn adds one more newline after the last line
showIx  :: Index -> IO ()
showIx ix
    = putStrLn (unlines (map line (M.toAscList ix)))
    where
    line (k, occs) = show k ++ "\t" ++ show (S.toAscList occs)

module IndexTest
where
import Index
-- ------------------------------------------------------------
--
-- example documents
-- four small documents, each a name paired with the words scanned from
-- a short German sentence
doc1, doc2, doc3, doc4 :: Document

doc1 = ( "Sesamstrasse"
       , scanText "Wieso, weshalb, warum? Wer nicht fragt, bleibt dumm!"
       )
doc2 = ( "Kohl"
       , scanText "Wichtig ist, was am Ende hinten rauskommt."
       )
doc3 = ( "Moeller"
       , scanText "Egal, ob Mailand oder Madrid, Hauptsache Italien! Italien, oder doch Spanien?"
       )
doc4 = ( "Wurst"
       , scanText "Alles hat ein Ende, nur die Wurst hat zwei."
       )
-- ------------------------------------------------------------
--
-- example index
-- indexes built step by step; the digits in a name tell which of the
-- example documents are contained in that index
ix0, ix1, ix12, ix123, ix1234, ix134, ix34 :: Index

ix0     = emptyIndex
ix1     = indexDocument doc1 ix0
ix12    = indexDocument doc2 ix1
ix123   = indexDocument doc3 ix12
ix1234  = indexDocument doc4 ix123
-- deleting documents again, addressed by their names
ix134   = deleteDocument (fst doc2) ix1234
ix34    = deleteDocument (fst doc1) ix134

-- actions that print some of the indexes above
s1, s12, s1234, s34 :: IO ()

s1      = showIx ix1
s12     = showIx ix12
s1234   = showIx ix1234
s34     = showIx ix34
-- ------------------------------------------------------------

Datenmodell

Das Datenmodell für einen einfachen Wort-Index als abstrakte Syntax in Haskell-Notation

Ein einfaches Testprogramm