-- Funktionale Programmierung: Extrahieren des Inhalts einer Seite mit XPath

module Main
where
 
import Text.XML.HXT.Arrow
import Text.XML.HXT.XPath.Arrows
import System.Environment
 
-- | Program entry point.
--
-- Expects exactly three command line arguments, in this order: the source
-- document, the destination, and the value handed to the 'a_show_tree'
-- write option (presumably @"0"@ or @"1"@, like 'v_0' / 'v_1' -- confirm
-- against the HXT option documentation).
--
-- With any other number of arguments an 'IOError' carrying a usage message
-- is raised, instead of the opaque pattern match failure a bare
-- @[src, dst, showTree] <- getArgs@ would produce.
main :: IO ()
main
    = do
      args <- getArgs
      case args of
        [src, dst, showTree] -> extract src dst showTree
        _                    -> usageError
    where
      -- Report wrong usage through the ordinary IO error channel.
      usageError :: IO ()
      usageError
          = do
            prog <- getProgName
            ioError (userError ("usage: " ++ prog ++ " SRC DST SHOW-TREE"))

      -- Read the source with HTML parsing enabled and warnings disabled,
      -- run 'processDocumentRootElement' on every element child of the
      -- document root (other children pass through unchanged), and write
      -- the indented result to the destination.
      extract :: String -> String -> String -> IO ()
      extract src dst showTree
          = do
            -- Only the side effects matter; the result list is dropped.
            _ <- runX ( readDocument
                          [ (a_parse_html, v_1)
                          , (a_issue_warnings, v_0)
                          ] src
                        >>>
                        processChildren
                          ( processDocumentRootElement
                            `when`
                            isElem
                          )
                        >>>
                        writeDocument
                          [ (a_indent, v_1)
                          , (a_show_tree, showTree)
                          ] dst
                      )
            return ()
-- | Build a @the-plain-text@ element whose content is whatever the XPath
-- expression 'contentTextPath' selects from the input tree.
processDocumentRootElement      :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
    = selem "the-plain-text" [ getXPathTrees contentTextPath ]
    where
      -- All text nodes below the top-aligned, 75% wide table cells that
      -- sit inside the @div@ with id @content@.
      contentTextPath :: String
      contentTextPath
          = "/html/body//div[@id=\"content\"]"
            ++ "//td[@valign=\"top\" and @width=\"75%\"]//text()"

-- Extrahieren des Inhalts einer Seite mit XPath

-- selectAllBodyTextXPath

-- Testlauf