Funktionale Programmierung: Extrahieren des Inhalts einer Seite

module Main
where
 
import Text.XML.HXT.Arrow
import System.Environment
 
-- | Program entry point.
--
-- Expects exactly three command line arguments:
--
--   1. @src@      - the document to read (passed to 'readDocument'),
--   2. @dst@      - the output destination (passed to 'writeDocument'),
--   3. @showTree@ - the value given to the @a_show_tree@ output option
--                   (the sample invocations use @0@ and @1@).
--
-- The document is read with the HTML parser option switched on and
-- warnings switched off, every element child of the document root is
-- rewritten by 'processDocumentRootElement', and the result is written
-- with indentation enabled.
--
-- With any other number of arguments the program fails with a usage
-- message instead of an opaque pattern match failure.
main :: IO ()
main
    = do
      args <- getArgs
      case args of
        [src, dst, showTree]
            -> do
               -- The list of results produced by the arrow is not needed;
               -- it is bound explicitly so the discard is intentional.
               _ <- runX ( readDocument
                             [ (a_parse_html, v_1)
                             , (a_issue_warnings, v_0)
                             ] src
                           >>>
                           processChildren
                             ( -- 'when' is applied to two arrows here: the
                               -- rewrite only runs for nodes accepted by
                               -- 'isElem'.
                               processDocumentRootElement
                               `when`
                               isElem
                             )
                           >>>
                           writeDocument
                             [ (a_indent, v_1)
                             , (a_show_tree, showTree)
                             ] dst
                         )
               return ()
        _
            -> do
               prog <- getProgName
               ioError
                 ( userError
                   ( "usage: " ++ prog ++ " <src> <dst> <show-tree>" )
                 )
-- | Build a @the-plain-text@ element from the text found in the main
-- content cell of an HTML document.
--
-- The input node must be the @html@ element. From there the arrow walks
-- to the children of @body@, searches with 'deep' for a @div@ whose @id@
-- attribute equals @content@, searches below that for a @td@ with
-- @valign="top"@ and @width="75%"@, and finally collects the text nodes
-- found below that cell with @deep isText@.
processDocumentRootElement :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
    = selem "the-plain-text" [ plainText ]
    where
      -- The @div@ element carrying @id="content"@.
      contentDiv :: ArrowXml a => a XmlTree XmlTree
      contentDiv
          = hasName "div" >>> hasAttrValue "id" (== "content")

      -- The table cell selected by its @valign@ and @width@ attributes.
      mainCell :: ArrowXml a => a XmlTree XmlTree
      mainCell
          = hasName "td"
            >>> hasAttrValue "valign" (== "top")
            >>> hasAttrValue "width" (== "75%")

      -- Full selection path from the @html@ element down to the text nodes.
      plainText :: ArrowXml a => a XmlTree XmlTree
      plainText
          = hasName "html" >>> getChildren
            >>> hasName "body" >>> getChildren
            >>> deep contentDiv >>> getChildren
            >>> deep mainCell >>> getChildren
            >>> deep isText

ghc -e ":main haskell.org - 0" selectAllBodyText.hs

ghc -e ":main haskell.org - 1" selectAllBodyText.hs

Extrahieren des Inhalts einer Seite

selectAllBodyText

1. Testlauf

2. Testlauf