Funktionale Programmierung: Textinhalt einer Seite formatieren |
1module Main
2where
3
4import Text.XML.HXT.Arrow
5import System.Environment
6
-- | Program entry point.
--
-- Expects exactly three command line arguments:
--
--   1. @src@      - the document to read, passed to 'readDocument'
--   2. @dst@      - the output target, passed to 'writeDocument'
--   3. @showTree@ - the value of the 'a_show_tree' output option
--                   (presumably \"0\" or \"1\", like the other v_0/v_1
--                   option values used here - confirm against HXT docs)
--
-- The source is parsed as HTML with warnings switched off, every element
-- child of the document root is replaced by the result of
-- 'processDocumentRootElement', and the result is written indented.
--
-- With any other number of arguments an 'IOError' carrying a usage
-- message is raised instead of an anonymous pattern match failure.
main :: IO ()
main
    = do
      args <- getArgs
      case args of
        [src, dst, showTree]
            -> do
               -- the resulting tree list is not needed, only the
               -- side effect of writing the document
               _ <- runX ( readDocument
                             [ (a_parse_html,     v_1)
                             , (a_issue_warnings, v_0)
                             ] src
                           >>>
                           processChildren
                             ( processDocumentRootElement
                               `when`
                               isElem
                             )
                           >>>
                           writeDocument
                             [ (a_indent,    v_1)
                             , (a_show_tree, showTree)
                             ] dst
                         )
               return ()
        _   -> do
               prog <- getProgName
               ioError
                 ( userError
                   ( "usage: " ++ prog ++ " <src> <dst> <show-tree>" )
                 )
28
-- | Build the replacement for the document's root element: an element
-- named @the-plain-text@ whose children are a newline text node followed
-- by the text node produced by 'formatBodyText'.
processDocumentRootElement :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
    = selem "the-plain-text"
        [ txt "\n"
        , formatBodyText
        ]
34
-- | Turn the text selected by 'getBodyText' into a single text node.
--
-- The selected nodes are serialised to one string with 'xshow', the
-- string is split into words, and the words are regrouped into lines of
-- at most 'wordsPerLine' words, each line terminated by a newline.
formatBodyText :: ArrowXml a => a XmlTree XmlTree
formatBodyText
    = xshow getBodyText >>> arr reflow >>> mkText
    where
      -- number of words placed on one output line
      wordsPerLine :: Int
      wordsPerLine = 8

      -- words -> groups of words -> lines -> one string
      reflow :: String -> String
      reflow = unlines . map unwords . chunks . words

      -- cut a list into consecutive pieces of 'wordsPerLine' elements;
      -- the last piece may be shorter, an empty list gives no pieces
      chunks :: [b] -> [[b]]
      chunks
          = takeWhile (not . null)
            . map (take wordsPerLine)
            . iterate (drop wordsPerLine)
52
-- | Select the text nodes that make up the page's main content.
--
-- Starting at an @html@ element, the arrow descends to its @body@
-- child, searches below it for @div@ elements with @id="content"@,
-- searches below those for @td@ elements with @valign="top"@ and
-- @width="75%"@, and finally collects the text nodes found below those
-- cells.
--
-- NOTE(review): the div/td criteria look tailored to the layout of one
-- specific site - confirm they still match the pages being processed.
getBodyText :: ArrowXml a => a XmlTree XmlTree
getBodyText
    = hasName "html"
      >>> getChildren >>> hasName "body"
      >>> getChildren >>> deep contentDiv
      >>> getChildren >>> deep topCell
      >>> getChildren >>> deep isText
    where
      -- the <div id="content"> container
      contentDiv
          = hasName "div"
            >>> hasAttrValue "id" (== "content")

      -- the <td valign="top" width="75%"> table cell
      topCell
          = hasName "td"
            >>> hasAttrValue "valign" (== "top")
            >>> hasAttrValue "width" (== "75%")
|
ghc -e ":main haskell.org - 0" selectPureBodyText.hs
|
ghc -e ":main haskell.org - 1" selectPureBodyText.hs
|
Letzte Änderung: 27.03.2015 | © Prof. Dr. Uwe Schmidt |