Funktionale Programmierung: Extrahieren des Inhalts einer Seite mit XPath
1module Main
2where
3
4import Text.XML.HXT.Arrow
5import Text.XML.HXT.XPath.Arrows
6
7import System.Environment
8
-- | Program entry point.
--
-- Expects exactly three command line arguments:
--
--   1. @src@      - the document to read (passed to 'readDocument'),
--   2. @dst@      - the output destination (passed to 'writeDocument'),
--   3. @showTree@ - the value used for the 'a_show_tree' output option.
--
-- The source is read with the HTML parser option switched on and the
-- warnings option switched off, every element child of the document root
-- is replaced by the result of 'processDocumentRootElement', and the
-- result is written with the indent option switched on.
--
-- With any other number of arguments an 'IOError' carrying a usage
-- message is raised instead of an opaque pattern match failure.
main :: IO ()
main
    = do
      args <- getArgs
      case args of
        [src, dst, showTree]
            -> do
               -- the list of result trees is not needed here,
               -- the arrow is run for its output side effect only
               _ <- runX ( readDocument
                           [ (a_parse_html,     v_1)
                           , (a_issue_warnings, v_0)
                           ] src
                           >>>
                           processChildren
                           ( processDocumentRootElement
                             `when`
                             isElem          -- leave non-element children untouched
                           )
                           >>>
                           writeDocument
                           [ (a_indent,    v_1)
                           , (a_show_tree, showTree)
                           ] dst
                         )
               return ()
        _   -> do
               prog <- getProgName
               ioError $ userError $
                   "usage: " ++ prog ++ " <src> <dst> <show-tree>"
30
-- | Build a @the-plain-text@ element whose content is whatever the
-- XPath expression below selects from the input tree: the text nodes
-- below the matching @td@ cells inside the @div@ with id @content@
-- in the HTML body.
processDocumentRootElement :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
    = selem "the-plain-text" [getXPathTrees contentTextPath]
    where
    -- the XPath expression, split in two parts for readability
    contentTextPath
        = concat
          [ "/html/body//div[@id=\"content\"]"
          , "//td[@valign=\"top\" and @width=\"75%\"]//text()"
          ]
|
ghc -e ":main haskell.org - 0" selectAllBodyTextXPath.hs
|
Letzte Änderung: 27.03.2015 | © Prof. Dr. Uwe Schmidt