创建一个(简单的)HTML 解析器。
Oracle 设置:
-- Turn off SQL*Plus substitution variables so that any '&' characters in the
-- Java source that follows are sent to the database verbatim instead of
-- triggering a substitution prompt.
SET DEFINE OFF;
CREATE OR REPLACE AND COMPILE JAVA SOURCE NAMED HTMLTOTEXT AS
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
/**
 * Converts HTML markup to plain text using the Swing HTML parser.
 *
 * Text content is copied through unchanged; a fixed set of block-level tags is
 * rendered as line breaks and table cells are separated by tab characters.
 * All other tags, comments, and parse errors are ignored.
 */
public class HTMLToText {

    /** Utility class: only the static entry point is meant to be used. */
    private HTMLToText() {
    }

    /**
     * Parser callback that collects text and translates selected tags into
     * whitespace separators.
     */
    private static class TextStripper extends ParserCallback {

        /** Line separator emitted for block-level tags. */
        private static final String NEWLINE = "\r\n";

        /** Separator to emit when a given tag is opened. */
        private static final HashMap<Tag, String> START_TAGS = new HashMap<Tag, String>();

        /** Separator to emit when a given tag is closed. */
        private static final HashMap<Tag, String> END_TAGS = new HashMap<Tag, String>();

        static {
            START_TAGS.put(Tag.BR, NEWLINE);
            START_TAGS.put(Tag.P, NEWLINE);
            START_TAGS.put(Tag.LI, NEWLINE);
            START_TAGS.put(Tag.DT, NEWLINE);
            START_TAGS.put(Tag.DL, NEWLINE);
            START_TAGS.put(Tag.OL, NEWLINE);
            START_TAGS.put(Tag.UL, NEWLINE);
            START_TAGS.put(Tag.TR, NEWLINE);
            START_TAGS.put(Tag.TD, "\t");
            START_TAGS.put(Tag.TH, "\t");

            END_TAGS.put(Tag.P, NEWLINE);
            END_TAGS.put(Tag.LI, NEWLINE);
            END_TAGS.put(Tag.DD, NEWLINE);
            END_TAGS.put(Tag.DL, NEWLINE);
            END_TAGS.put(Tag.OL, NEWLINE);
            END_TAGS.put(Tag.UL, NEWLINE);
            END_TAGS.put(Tag.TR, NEWLINE);
        }

        // A StringBuilder is sufficient: each TextStripper is created for, and
        // used by, exactly one extractText call.
        private final StringBuilder buffer = new StringBuilder();

        /**
         * True while the output is positioned at the start of a line (initially,
         * and right after a NEWLINE was emitted). Separators are suppressed in
         * that state so consecutive block tags do not produce blank lines and
         * the first cell of a table row is not preceded by a tab.
         */
        private boolean newline = true;

        /**
         * Appends the separator mapped to a tag, if there is one and the output
         * is not already at the start of a line.
         *
         * @param separator the mapped separator, or null when the tag is unmapped
         */
        private void appendSeparator(final String separator) {
            if (newline || separator == null) {
                return;
            }
            buffer.append(separator);
            newline = NEWLINE.equals(separator);
        }

        @Override
        public void handleText(final char[] data, final int pos) {
            buffer.append(data);
            newline = false;
        }

        @Override
        public void handleStartTag(final Tag tag, final MutableAttributeSet attribute, final int pos) {
            appendSeparator(START_TAGS.get(tag));
        }

        @Override
        public void handleEndTag(final Tag tag, final int pos) {
            appendSeparator(END_TAGS.get(tag));
        }

        @Override
        public void handleSimpleTag(final Tag tag, final MutableAttributeSet attribute, final int pos) {
            // An empty element such as <br> is treated as an open immediately
            // followed by a close.
            handleStartTag(tag, attribute, pos);
            handleEndTag(tag, pos);
        }

        @Override
        public void handleComment(final char[] data, final int pos) {
            // Comments contribute nothing to the text output.
        }

        @Override
        public void handleError(final String errMsg, final int pos) {
            // Malformed markup is tolerated; parse errors are deliberately ignored.
        }

        /** @return the plain text accumulated so far */
        public String getText() {
            return buffer.toString();
        }
    }

    private static final ParserDelegator DELEGATOR = new ParserDelegator();

    /**
     * Extracts the plain text from an HTML string.
     *
     * @param html the markup to convert; may be null
     * @return the extracted text, or null when {@code html} is null
     * @throws IOException if reading the markup fails
     */
    public static String extractText(final String html) throws IOException {
        // A SQL NULL argument arrives as a Java null; return NULL for it rather
        // than failing with a NullPointerException inside StringReader.
        if (html == null) {
            return null;
        }
        final TextStripper stripper = new TextStripper();
        DELEGATOR.parse(new StringReader(html), stripper, true);
        return stripper.getText();
    }
}
/
-- Call specification: publishes the static Java method
-- HTMLToText.extractText(String) as the SQL-callable function HTML_TO_TEXT.
--   in_html : the HTML markup to convert
--   returns : the value returned by the Java method, as VARCHAR2
CREATE OR REPLACE FUNCTION HTML_TO_TEXT ( in_html IN VARCHAR2 )
  RETURN VARCHAR2
IS
  LANGUAGE JAVA
  NAME 'HTMLToText.extractText( java.lang.String ) return java.lang.String';
/
-- Report compilation errors for the object created just above. SHOW ERRORS is
-- a SQL*Plus command and needs no "/" terminator; a bare "/" here would
-- re-execute the statement still held in the SQL buffer.
SHOW ERRORS;
查询:
-- Example: convert a small document containing body text, a paragraph with a
-- line break, and a two-row table.
SELECT HTML_TO_TEXT( '<html><body>Text<p>para<br>graph</p><table><tr><th>R1</th><td>C1</td><td>C2</td></tr><tr><th>R2</th><td>C1</td><td>C2</td></tr></table></body></html>' ) FROM DUAL;
输出:
HTML_TO_TEXT('<HTML><BODY>TEXT<P>PARA<BR>GRAPH</P><TABLE><TR><TH>R1</TH><TD>C1</
--------------------------------------------------------------------------------
Text
para
graph
R1 C1 C2
R2 C1 C2