Compilerbau: HTML-Scanner mit JLex |
für die Analyse von HTML-Tags, einfachem Text und Kommentaren. Diese Spezifikation arbeitet intern mit drei unabhängigen Automaten: einem für einfachen Text, einem für Tags und einem für Kommentare.
Zwischen diesen Automaten (STATES) wird mit
yybegin(STATE) hin- und hergeschaltet. |
/**
 * A single lexical token produced by the HTML scanner: a numeric token kind
 * plus the text that belongs to it.
 */
public class Token {

    /**
     * Token kinds. The values are consecutive integers starting at 0 so that
     * each one can be used as an index into {@link #tokens}.
     */
    public static final int
        PlainText = 0,
        CharRef = PlainText + 1,
        EntityRef = CharRef + 1,
        TagStart = EntityRef + 1,
        TagEnd = TagStart + 1,
        AttrName = TagEnd + 1,
        AttrValue = AttrName + 1,
        Comment = AttrValue + 1,
        DocType = Comment + 1,
        Error = DocType + 1;

    /**
     * Display names of the token kinds, indexed by token kind. Some entries
     * carry trailing blanks; they are part of the printed output.
     */
    public static final String [] tokens = {
        "PlainText",
        "CharRef ",
        "EntityRef",
        "TagStart ",
        "TagEnd ",
        "AttrName ",
        "AttrValue",
        "Comment ",
        "DocType ",
        "Error "
    };

    /** Kind of this token; normally one of the constants declared above. */
    public int token;

    /** Text associated with this token. */
    public String text;

    /**
     * Creates a token.
     *
     * @param token the token kind
     * @param text  the text associated with the token
     */
    Token(int token,
          String text) {
        this.token = token;
        this.text = text;
    }

    /**
     * Renders the token as its display name, a tab, and the text in double quotes.
     *
     * A token kind that lies outside the {@link #tokens} table is rendered as
     * {@code Unknown(<kind>)} instead of raising an
     * ArrayIndexOutOfBoundsException, so that printing a token never fails.
     *
     * @return the printable form of this token
     */
    public String toString() {
        String name;
        if (token >= 0 && token < tokens.length) {
            name = tokens[token];
        } else {
            name = "Unknown(" + token + ")";
        }
        return name + "\t\"" + text + "\"";
    }
}
|
import java.io.IOException;
/**
 * Command-line driver: runs the generated HtmlScanner over standard input
 * and prints every token it delivers, one per line.
 */
public class HtmlLexicalAnalyser {

    /**
     * Reads tokens from System.in until the scanner returns null
     * (presumably the end-of-input signal of the generated scanner) and
     * writes each token to System.out.
     *
     * @param argv command-line arguments; not used
     * @throws IOException if the scanner fails while reading its input
     */
    public static void main(String[] argv) throws IOException {
        HtmlScanner scanner = new HtmlScanner(System.in);
        for (Token current = scanner.nextSymbol();
             current != null;
             current = scanner.nextSymbol()) {
            System.out.println(current);
        }
    }
}
%%
%class HtmlScanner
%type Token
%function nextSymbol
%line
%char
%{
// Collects the characters of the comment currently being scanned.
// It is emptied when a comment starts and read out when the comment ends.
private
StringBuffer commentString = new StringBuffer(256);
/**
 * Builds a token from the currently matched text, cutting off a prefix
 * and a suffix.
 *
 * @param token the token kind
 * @param start number of leading characters of the match to drop
 * @param end   number of trailing characters of the match to drop
 * @return a token holding the remaining middle part of the match
 */
private Token mkToken(int token, int start, int end) {
    String matched = yytext();
    int stop = matched.length() - end;
    return new Token(token, matched.substring(start, stop));
}
/**
 * Builds a token from the currently matched text, cutting off a prefix only.
 *
 * @param token the token kind
 * @param start number of leading characters of the match to drop
 * @return a token holding the match from position start to its end
 */
private Token mkToken(int token, int start) {
    String matched = yytext();
    return new Token(token, matched.substring(start));
}
/**
 * Builds a token that carries the complete currently matched text.
 *
 * @param token the token kind
 * @return a token holding the whole match
 */
private Token mkToken(int token) {
    String matched = yytext();
    return new Token(token, matched);
}
%}
%state TAG COMMENT
PLAINTEXT=[^<&]+
COMMENTSTART="<!--"
COMMENTEND="-->"
DOCTYPE="<!DOCTYPE"[^>]*">"
LETTER=[A-Za-z]
DIGIT=[0-9]
EXTRA=[\-:_/]
WHITESPACE=([\ \n\t\r])
DQ="\""
SQ="'"
NAME=({LETTER}|{DIGIT}|{EXTRA})+
TAGSTART="<"{NAME}
TAGATTR={NAME}
TAGEND=">"
TAGVALUE1="="([^\ \t\n\r{DQ}{SQ}>])*
TAGVALUE2="="{DQ}[^{DQ}]*{DQ}
TAGVALUE3="="{SQ}[^{SQ}]*{SQ}
CHARREF="&#"{DIGIT}+";"
ENTITYREF="&"{NAME}";"
%%
<YYINITIAL> {CHARREF} {
// Numeric character reference: keep only the digits between the two-character prefix and the final semicolon.
return
mkToken(Token.CharRef, 2, 1);
}
<YYINITIAL> {ENTITYREF} {
// Entity reference: keep only the name between the ampersand and the semicolon.
return
mkToken(Token.EntityRef, 1, 1);
}
<YYINITIAL> {PLAINTEXT} {
// Run of text without any less-than sign or ampersand, returned unchanged.
return
mkToken(Token.PlainText);
}
<YYINITIAL> {COMMENTSTART} {
// Comment opener: switch to the COMMENT automaton and empty the comment buffer.
// This action has no return statement, so no token is produced for the opener itself.
yybegin(COMMENT);
commentString.setLength(0);
}
<YYINITIAL> {DOCTYPE} {
// Document type declaration: cut off the two leading characters and the closing angle bracket.
return
mkToken(Token.DocType, 2 , 1);
}
<YYINITIAL> {TAGSTART} {
// Tag opener: switch to the TAG automaton and return the tag name without the leading angle bracket.
yybegin(TAG);
return
mkToken(Token.TagStart,1);
}
<YYINITIAL> . {
// Fallback for a single character that none of the rules above accepted.
return
mkToken(Token.Error);
}
<TAG> {TAGEND} {
// Closing angle bracket: the tag is finished, go back to the plain text automaton.
yybegin(YYINITIAL);
return
mkToken(Token.TagEnd);
}
<TAG> {TAGATTR} {
// Attribute name, returned unchanged.
return
mkToken(Token.AttrName);
}
<TAG> {TAGVALUE1} {
// Unquoted attribute value: drop the leading equals sign.
return
mkToken(Token.AttrValue, 1);
}
<TAG> {TAGVALUE2} {
// Double-quoted attribute value: drop the equals sign with the opening quote, and the closing quote.
return
mkToken(Token.AttrValue, 2, 1);
}
<TAG> {TAGVALUE3} {
// Single-quoted attribute value: drop the equals sign with the opening quote, and the closing quote.
return
mkToken(Token.AttrValue, 2, 1);
}
<TAG> {WHITESPACE} { /* whitespace inside a tag produces no token */ }
<TAG> . {
// Fallback for a single character inside a tag that none of the rules above accepted.
return
mkToken(Token.Error);
}
<COMMENT> {COMMENTEND} {
// Comment closer: go back to the plain text automaton and deliver the collected comment text.
yybegin(YYINITIAL);
return
new
Token(Token.Comment,
commentString.toString());
}
<COMMENT> . {
// Any other character inside a comment is collected in the buffer; no token is returned yet.
commentString.append(yytext());
}
<COMMENT> \n {
// Line breaks have their own rule and are collected in the buffer as well.
commentString.append(yytext());
}
|
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html>
<head>
<title>HTML-Testseite</title>
</head>
<body bgcolor="#ffffff">
<h1>&lt;HTML-Testseite&gt;</h1>
<hr width=50>
<address><a href='mailto:uwe@localhost'>Uwe&#32;
sein Benutzer</a></address>
<!-- Created: Thu Sep 23 09:26:04 CEST 1999 -->
<!-- hhmts start -->
Last modified: Wed Nov 17 11:49:12 CET 2004
<!-- hhmts end -->
</body>
</html> |
DocType "DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN""
PlainText "
"
TagStart "html"
TagEnd ">"
PlainText "
"
TagStart "head"
TagEnd ">"
PlainText "
"
TagStart "title"
TagEnd ">"
PlainText "HTML-Testseite"
TagStart "/title"
TagEnd ">"
PlainText "
"
TagStart "/head"
TagEnd ">"
PlainText "
"
TagStart "body"
AttrName "bgcolor"
AttrValue "#ffffff"
TagEnd ">"
PlainText "
"
TagStart "h1"
TagEnd ">"
EntityRef "lt"
PlainText "HTML-Testseite"
EntityRef "gt"
TagStart "/h1"
TagEnd ">"
PlainText "
"
TagStart "hr"
AttrName "width"
AttrValue "50"
TagEnd ">"
PlainText "
"
TagStart "address"
TagEnd ">"
TagStart "a"
AttrName "href"
AttrValue "mailto:uwe@localhost"
TagEnd ">"
PlainText "Uwe"
CharRef "32"
PlainText "
sein Benutzer"
TagStart "/a"
TagEnd ">"
TagStart "/address"
TagEnd ">"
PlainText "
"
Comment " Created: Thu Sep 23 09:26:04 CEST 1999 "
PlainText "
"
Comment " hhmts start "
PlainText "
Last modified: Wed Nov 17 11:49:12 CET 2004
"
Comment " hhmts end "
PlainText "
"
TagStart "/body"
TagEnd ">"
PlainText "
"
TagStart "/html"
TagEnd ">"
PlainText "
"
|
# Build file for the JLex HTML scanner example.
# Recipe lines must start with a TAB character.

# Class path: a shared software directory three levels up (presumably where
# JLex is installed -- confirm), the current directory, and the inherited CLASSPATH.
JCP := $(PWD)/../../../software:.:$(CLASSPATH)
JAVA := java -classpath $(JCP)
JAVAC := javac -classpath $(JCP)
JLEX := java -classpath $(JCP) JLex.Main

# Scanner specifications, the Java files generated from them,
# all Java sources present in the directory, and the class files of both.
lexsrc := $(shell echo *.lex)
javagen := $(lexsrc:.lex=.java)
javasrc := $(shell echo *.java)
classes := $(lexsrc:.lex=.class) \
	$(javasrc:.java=.class)

# Token dump produced by running the scanner on the test page.
output := Test.html.tokens

# These targets name actions, not files; a stray file called "all",
# "examples" or "clean" must not stop them from running.
.PHONY : all examples clean

all : examples

examples : $(javagen) $(classes) $(output)

# Extra prerequisites for the generated scanner source; the recipe comes
# from the %.java pattern rule below.
HtmlLexicalAnalyser.java : HtmlLexicalAnalyser.lex Token.java

# Run the scanner on the test page and store the printed tokens.
Test.html.tokens : Test.html $(classes)
	$(JAVA) HtmlLexicalAnalyser \
		< Test.html \
		> Test.html.tokens

# Generate a scanner; the JLex output file <spec>.lex.java is renamed to <spec>.java.
%.java : %.lex
	$(JLEX) $<
	mv -f $<.java $*.java

%.class : %.java
	$(JAVAC) $<

.SUFFIXES : .class .java .lex

# Remove generated sources, class files, the token dump and editor backups.
clean :
	rm -f $(javagen) \
		$(classes) \
		$(output) \
		*~ .*~
|
Letzte Änderung: 14.02.2012 | © Prof. Dr. Uwe Schmidt |