-- Funktionale Programmierung: Extrahieren des Inhalts einer Seite (kürzer)

module Main
where
 
import Text.XML.HXT.Arrow
import System.Environment
 
-- | Program entry point.
--
-- Expects exactly three command-line arguments, in this order:
--
--   * @src@      - the document to read; handed to 'readDocument'
--   * @dst@      - the output destination; handed to 'writeDocument'
--   * @showTree@ - the value stored under the 'a_show_tree' write option
--
-- The document is read with the 'a_parse_html' option set to 'v_1' and
-- 'a_issue_warnings' set to 'v_0'.  Every child of the document root
-- that is an element is replaced by the result of
-- 'processDocumentRootElement'; all other children pass through
-- unchanged.  The result is written with 'a_indent' set to 'v_1'.
--
-- With any other number of arguments an 'IOError' carrying a usage
-- message is raised instead of an anonymous pattern-match failure.
main :: IO ()
main
    = do
      args <- getArgs
      case args of
        [src, dst, showTree]
          -> do
             -- Only the side effects of the pipeline are wanted, so the
             -- list of result trees returned by 'runX' is dropped.
             _ <- runX ( readDocument
                           [ (a_parse_html, v_1)
                           , (a_issue_warnings, v_0)
                           ] src
                         >>>
                         processChildren
                           ( processDocumentRootElement
                             `when`
                             isElem
                           )
                         >>>
                         writeDocument
                           [ (a_indent, v_1)
                           , (a_show_tree, showTree)
                           ] dst
                       )
             return ()
        _ -> do
             -- Same failure channel as the original partial pattern
             -- (a user error in IO), but with an actionable message.
             prog <- getProgName
             ioError
               ( userError
                   ( "usage: " ++ prog ++ " <src> <dst> <show-tree>" )
               )
-- | Build a new @the-plain-text@ element whose content is the text
-- selected from the input tree.
--
-- The selection is a single left-to-right chain: an @html@ element,
-- a @body@ child of it ('/>'), and then three successive '//>' steps
-- that look for the content @div@, the main table cell inside it, and
-- finally the text nodes inside that cell.
processDocumentRootElement      :: ArrowXml a => a XmlTree XmlTree
processDocumentRootElement
    = selem "the-plain-text" [ plainText ]
    where
    -- The complete selection path; the stages are named below.
    plainText
        = hasName "html" /> hasName "body" //> contentDiv //> mainCell //> isText

    -- A @div@ element whose @id@ attribute equals @content@.
    contentDiv
        = hasName "div" >>> hasAttrValue "id" (== "content")

    -- A @td@ element with @valign="top"@ and @width="75%"@.
    mainCell
        = hasName "td"
          >>> hasAttrValue "valign" (== "top")
          >>> hasAttrValue "width" (== "75%")

-- ghc -e ":main haskell.org - 0" selectAllBodyText1.hs

-- ghc -e ":main haskell.org - 1" selectAllBodyText1.hs

-- Extrahieren des Inhalts einer Seite (kürzer)

-- selectAllBodyText1

-- 1. Testlauf

-- 2. Testlauf