|
马上注册,结交更多好友,享用更多功能,让你轻松玩转社区。
您需要 登录 才可以下载或查看,没有帐号?立即注册
x
一旦你有了思想,那你编的程序就有了灵魂,不管是什么语言到了你的手里都只是你的工具而已,它们的价值是能尽快帮助你实现你想要的目标。但是如果你没有了思想,那就像是海里的帆船失去了船帆,是很难到达海的另一边的。 为了支持全文检索,有必要将HTML格式的文章转化为纯文本格式,因此我设计了一个基本的WebFormatter类,提供一个简单的 public static String html2text(String html),将HTML格式转化为Text:
/*
 * File: WebFormatter.java
 * Created on 2005-6-24
 * Author: Liao Xuefeng, asklxf@163.com
 * Copyright (C) 2005, Liao Xuefeng.
 */
packagecom.mboker.blog.web.util;
importjava.util.*;
importjava.text.SimpleDateFormat;
/**
*Dosomeformatonwebdisplay.
*
*@authorXuefeng
*/
publicclassWebFormatter{
publicstaticStringhtml2text(Stringhtml){
StringBuffersb=newStringBuffer(html.length());
char[]data=html.toCharArray();
intstart=0;
booleanpreviousIsPre=false;
Tokentoken=null;
for(;;){
token=parse(data,start,previousIsPre);
if(token==null)
break;
previousIsPre=token.isPreTag();
sb=sb.append(token.getText());
start+=token.getLength();
}
returnsb.toString();
}
privatestaticTokenparse(char[]data,intstart,booleanpreviousIsPre){
if(start>=data.length)
returnnull;
//trytoreadnextchar:
charc=data[start];
if(c==<){
//thisisatagorcommentorscript:
intend_index=indexOf(data,start+1,>);
if(end_index==(-1)){
//theleftisalltext!
returnnewToken(Token.TOKEN_TEXT,data,start,data.length,previousIsPre);
}
Strings=newString(data,start,end_index-start+1);
//nowwegots="<...>":
if(s.startsWith("<!--")){//thisisacomment!
intend_comment_index=indexOf(data,start+1,"-->");
if(end_comment_index==(-1)){
//illegalend,buttreatascomment:
returnnewToken(Token.TOKEN_COMMENT,data,start,data.length,previousIsPre);
}
else
returnnewToken(Token.TOKEN_COMMENT,data,start,end_comment_index+3,previousIsPre);
}
Strings_lowerCase=s.toLowerCase();
if(s_lowerCase.startsWith("<script")){//thisisascript:
intend_script_index=indexOf(data,start+1,"</script>");
if(end_script_index==(-1))
//illegalend,buttreatasscript:
returnnewToken(Token.TOKEN_SCRIPT,data,start,data.length,previousIsPre);
else
returnnewToken(Token.TOKEN_SCRIPT,data,start,end_script_index+9,previousIsPre);
}
else{//thisisatag:
returnnewToken(Token.TOKEN_TAG,data,start,start+s.length(),previousIsPre);
}
}
//thisisatext:
intnext_tag_index=indexOf(data,start+1,<);
if(next_tag_index==(-1))
returnnewToken(Token.TOKEN_TEXT,data,start,data.length,previousIsPre);
returnnewToken(Token.TOKEN_TEXT,data,start,next_tag_index,previousIsPre);
}
privatestaticintindexOf(char[]data,intstart,Strings){
char[]ss=s.toCharArray();
//TODO:performancecanimprove!
for(inti=start;i<(data.length-ss.length);i++){
//comparefromdata[i]withss[0]:
booleanmatch=true;
for(intj=0;j<ss.length;j++){
if(data[i+j]!=ss[j]){
match=false;
break;
}
}
if(match)
returni;
}
return(-1);
}
privatestaticintindexOf(char[]data,intstart,charc){
for(inti=start;i<data.length;i++){
if(data[i]==c)
returni;
}
return(-1);
}
}
classToken{
publicstaticfinalintTOKEN_TEXT=0;//htmltext.
publicstaticfinalintTOKEN_COMMENT=1;//commentlike<!--comments...-->
publicstaticfinalintTOKEN_TAG=2;//taglike<pre>,<font>,etc.
publicstaticfinalintTOKEN_SCRIPT=3;
privatestaticfinalchar[]TAG_BR="<br".toCharArray();
privatestaticfinalchar[]TAG_P="<p".toCharArray();
privatestaticfinalchar[]TAG_LI="<li".toCharArray();
privatestaticfinalchar[]TAG_PRE="<pre".toCharArray();
privatestaticfinalchar[]TAG_HR="<hr".toCharArray();
privatestaticfinalchar[]END_TAG_TD="</td>".toCharArray();
privatestaticfinalchar[]END_TAG_TR="</tr>".toCharArray();
privatestaticfinalchar[]END_TAG_LI="</li>".toCharArray();
privatestaticfinalMapSPECIAL_CHARS=newHashMap();
privateinttype;
privateStringhtml;//originalhtml
privateStringtext=null;//text!
privateintlength=0;//htmllength
privatebooleanisPre=false;//isPretag?
static{
SPECIAL_CHARS.put(""",""");
SPECIAL_CHARS.put("<","<");
SPECIAL_CHARS.put(">",">");
SPECIAL_CHARS.put("&","&");
SPECIAL_CHARS.put(" |
|