Softwaredesign: Datenmodell |
1module Index
2where
3
4import Data.Char
5import Data.List(isPrefixOf)
6
7import Data.Map(Map)
8import qualified Data.Map as M
9import Data.Set(Set)
10import qualified Data.Set as S
11
12-- ------------------------------------------------------------
13--
14-- data types
15
16type Index = Map Word Occurences
17
18type Occurences = Set Occurence
19
20type Occurence = (DocumentName, Position)
21
22type Document = (DocumentName, WordList)
23
24type WordList = [Word]
25
26type DocumentNames = Set DocumentName
27
28-- alias names
29
30type Word = String
31type DocumentName = String
32type Position = Int
33
34-- ------------------------------------------------------------
35--
36-- new index
37
38emptyIndex :: Index
39emptyIndex = M.empty
40
41-- ------------------------------------------------------------
42--
43-- basic functions
44
45indexDocument :: Document -> Index -> Index
46indexDocument' :: (Word -> Bool) -> Document -> Index -> Index
47
48deleteDocument :: DocumentName -> Index -> Index
49
50-- don't filter any words, all are of interest
51
52indexDocument = indexDocument' (const True)
53
54-- index a document, but filter uninteresting words
55
56indexDocument' isOfInterest (n, wl) ix
57 = foldr (\ (w', p') ix' -> indexWord w' n p' ix') ix wl2
58 where
59 wl1 = map normalize wl
60 wl2 = filter (isOfInterest . fst) . zip wl1 $ [0..]
61
62indexWord :: Word -> DocumentName -> Position -> Index -> Index
63indexWord w n p
64 = M.insertWith S.union w (S.singleton (n,p))
65
66deleteDocument n ix
67 = M.filter (not . S.null)
68 .
69 M.map delDoc $ ix
70 where
71 delDoc = S.filter ((/= n) . fst)
72
73searchIndex :: Word -> Index -> Occurences
74searchIndex w
75 = M.findWithDefault S.empty (normalize w)
76
77prefixSearchIndex :: Word -> Index -> Occurences
78prefixSearchIndex w
79 = M.fold S.union S.empty
80 .
81 M.filterWithKey (\ k a -> isPrefixOf (normalize w) k)
82
83-- projection: forget the positions within a document
84
85documentNames :: Occurences -> DocumentNames
86documentNames
87 = S.fold (\ (n, p) ds -> S.insert n ds) S.empty
88
89-- ------------------------------------------------------------
90--
91-- word processing
92
93normalize :: Word -> Word
94
95normalize
96 = map toLower
97 .
98 concatMap remUmlaut
99 where
100 remUmlaut '\196' = "Ae"
101 remUmlaut '\214' = "Oe"
102 remUmlaut '\220' = "Ue"
103 remUmlaut '\228' = "ae"
104 remUmlaut '\246' = "oe"
105 remUmlaut '\252' = "ue"
106 remUmlaut '\223' = "ss"
107 remUmlaut c = [c]
108
109scanText :: String -> WordList
110scanText = words . map ( \ c -> if isAlphaNum c then c else ' ')
111
112-- ------------------------------------------------------------
113
114showIx :: Index -> IO ()
115showIx ix
116 = putStrLn $ concatMap (\ (k,a) -> show k ++ "\t" ++ showPos a ++ "\n") . M.toAscList $ ix
117 where
118 showPos
119 = show . S.toAscList
|
1module IndexTest
2where
3
4import Index
5
6-- ------------------------------------------------------------
7--
8-- example documents
9
10doc1 = ( "Sesamstrasse"
11 , scanText "Wieso, weshalb, warum? Wer nicht fragt, bleibt dumm!"
12 )
13
14doc2 = ( "Kohl"
15 , scanText "Wichtig ist, was am Ende hinten rauskommt."
16 )
17
18doc3 = ( "Moeller"
19 , scanText "Egal, ob Mailand oder Madrid, Hauptsache Italien! Italien, oder doch Spanien?"
20 )
21
22doc4 = ( "Wurst"
23 , scanText "Alles hat ein Ende, nur die Wurst hat zwei."
24 )
25
26-- ------------------------------------------------------------
27--
28-- example index
29
30ix0 = emptyIndex
31
32ix1 = indexDocument doc1 ix0
33ix12 = indexDocument doc2 ix1
34ix123 = indexDocument doc3 ix12
35ix1234 = indexDocument doc4 ix123
36ix134 = deleteDocument (fst doc2) ix1234
37ix34 = deleteDocument (fst doc1) ix134
38
39s1 = showIx ix1
40s12 = showIx ix12
41s1234 = showIx ix1234
42s34 = showIx ix34
43
44-- ------------------------------------------------------------
|
Letzte Änderung: 10.07.2012 | © Prof. Dr. Uwe Schmidt |