Funktionale Programmierung: Extrahieren des Inhalts einer Seite (kürzer)
|
1module Main
2where
3
4import Text.XML.HXT.Arrow
5import System.Environment
6
-- | Program entry point.
--
-- Expects exactly three command line arguments:
--
--   1. @src@      - the document handed to 'readDocument',
--   2. @dst@      - the target handed to 'writeDocument',
--   3. @showTree@ - the value used for the 'a_show_tree' output option
--                   (the usage examples accompanying this listing pass
--                   @0@ or @1@).
--
-- The document is read with 'a_parse_html' set to 'v_1' and
-- 'a_issue_warnings' set to 'v_0'.  Every child of the document root that
-- is an element is run through 'processDocumentRootElement'; the result is
-- written with 'a_indent' set to 'v_1'.
--
-- Any other number of arguments raises an 'IOError' (user error) carrying a
-- usage message, instead of an anonymous pattern match failure.
main :: IO ()
main
  = do
    args <- getArgs
    case args of
      [src, dst, showTree]
        -> do
           -- The list of result trees is not needed; only the effects of
           -- reading and writing the document matter here.
           _ <- runX
                  ( readDocument
                      [ (a_parse_html, v_1)
                      , (a_issue_warnings, v_0)
                      ] src
                    >>>
                    processChildren
                      ( processDocumentRootElement
                        `when`
                        isElem
                      )
                    >>>
                    writeDocument
                      [ (a_indent, v_1)
                      , (a_show_tree, showTree)
                      ] dst
                  )
           return ()
      _ -> do
           -- Wrong argument count: fail with the same kind of exception the
           -- partial pattern used to raise, but with a message that tells
           -- the caller what is expected.
           prog <- getProgName
           ioError
             ( userError
               ( "usage: " ++ prog ++ " <src> <dst> <show-tree>" )
             )
28
-- | Replace the document's root element by a @the-plain-text@ element
-- holding the text nodes of the page's content cell.
--
-- The selection path is: the @body@ child of an @html@ element, then
-- (searching downwards with '//>') the @div@ whose @id@ is @content@,
-- below that the @td@ with @valign="top"@ and @width="75%"@, and below
-- that the text nodes.
processDocumentRootElement :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
  = selem "the-plain-text" [contentText]
  where
    -- '/>' and '//>' are chained left to right, exactly as in a single
    -- unbroken expression.
    contentText :: ArrowXml b => b XmlTree XmlTree
    contentText = pageBody //> contentDiv //> contentCell //> isText

    -- the body element directly below the html element
    pageBody :: ArrowXml b => b XmlTree XmlTree
    pageBody = hasName "html" /> hasName "body"

    -- the div marked with id "content"
    contentDiv :: ArrowXml b => b XmlTree XmlTree
    contentDiv = hasName "div" >>> hasAttrValue "id" (== "content")

    -- the table cell that is top aligned and 75% wide
    contentCell :: ArrowXml b => b XmlTree XmlTree
    contentCell
      = hasName "td"
        >>> hasAttrValue "valign" (== "top")
        >>> hasAttrValue "width" (== "75%")
|
ghc -e ":main haskell.org - 0" selectAllBodyText1.hs
|
ghc -e ":main haskell.org - 1" selectAllBodyText1.hs
|
Letzte Änderung: 27.03.2015 | © Prof. Dr. Uwe Schmidt