Java源码示例:org.htmlparser.Node
示例1
public HTMLForm(Node form, String url, String commonJS) {
this.form = form;
this.formName = ((FormTag) form).getFormName();
if (this.formName == null)
this.formName = "form_"+Integer.toString(formCounter);
this.url = url;
this.z3FormFormulas = new HashSet<Formula>();
this.commonJS = commonJS;
this.jsValidation = new String("");
this.domRepresentation = new String();
this.windowRepresentation = new String();
this.helperFuns = new String();
}
示例2
/**
 * Returns the number of columns occupied by the cells of the given row,
 * counting each cell's {@code colspan} attribute.
 * <p>
 * Only the direct children of the row that match {@code HTML_ROW_FILTER} are
 * counted. A cell for which {@code MathUtils.parseInt} yields {@code null}
 * (presumably a missing or non-numeric {@code colspan}) counts as one column.
 *
 * @param row the table row to inspect
 * @return the summed column span of the row's cells, 0 for a row without children
 */
private static int getColumnCount( TableRow row )
{
    // htmlparser returns null instead of an empty list for a tag without children.
    if ( row.getChildren() == null )
    {
        return 0;
    }

    Node[] cells = row.getChildren().extractAllNodesThatMatch( HTML_ROW_FILTER ).toNodeArray();

    int cols = 0;

    for ( Node cell : cells )
    {
        Integer colSpan = MathUtils.parseInt( ((TagNode) cell).getAttribute( "colspan" ) );

        cols += colSpan != null ? colSpan : 1;
    }

    return cols;
}
示例3
/**
 * Retrieves the value of a table cell by concatenating the text of its
 * direct child nodes. For composite tags such as span or div the inner
 * string text is appended; for any other node its plain text is appended.
 * <p>
 * The concatenated text is trimmed, and every match of the pattern passed to
 * {@code replaceAll} is then replaced with the {@code EMPTY} constant.
 *
 * @param cell the table cell tag
 * @return the processed cell text; a cell without children contributes no text
 */
public static String getValue( TagNode cell )
{
    StringBuilder builder = new StringBuilder();

    // htmlparser returns null instead of an empty list for a tag without children.
    if ( cell.getChildren() != null )
    {
        for ( Node child : cell.getChildren().toNodeArray() )
        {
            if ( child instanceof CompositeTag )
            {
                builder.append( ((CompositeTag) child).getStringText() );
            }
            else
            {
                builder.append( child.getText() );
            }
        }
    }

    return builder.toString().trim().replaceAll( " ", EMPTY );
}
示例4
/**
 * Parses the body of an HTML message into its nodes.
 * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details
 * of the node model.
 *
 * @param url the url that the message resulted from (not read by this method)
 * @param message the Message to parse
 * @return a NodeList containing the nodes making up the page, or null when the
 *         Content-Type header is missing or does not start with "text/html",
 *         when the message has no content, or when parsing fails
 */
public Object parseMessage(HttpUrl url, Message message) {
    String contentType = message.getHeader("Content-Type");
    boolean isHtml = contentType != null && contentType.matches("text/html.*");
    if (!isHtml) {
        return null;
    }

    byte[] content = message.getContent();
    boolean hasBody = content != null && content.length != 0;
    if (!hasBody) {
        return null;
    }

    // Filter that lets every node through, so the whole page is collected.
    NodeFilter acceptEverything = new NodeFilter() {
        public boolean accept(Node node) {
            return true;
        }
    };

    // NOTE: the bytes are decoded with the platform default charset.
    Parser parser = Parser.createParser(new String(content), null);
    try {
        return parser.extractAllNodesThatMatch(acceptEverything);
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}
示例5
/**
 * Fetches the Ganglia page of a cluster and returns the option texts of its
 * {@code <select id="metrics-picker">} element(s).
 * <p>
 * The page URL is built by replacing {@code clusterPattern} in
 * {@code gangliaMetricUrl} with the cluster name (regex replacement).
 *
 * @param clusterName the name substituted into the metric URL
 * @return the text of every option found, in document order; empty if none
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is invalid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
        throws ParserException, MalformedURLException, IOException {
    String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
    Parser parser = new Parser(new URL(url).openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    SimpleNodeIterator iterator = nodeList.elements();
    List<String> metricList = new ArrayList<String>();
    while (iterator.hasMoreNodes()) {
        Node node = iterator.nextNode();
        // A select without any child has a null children list.
        if (node.getChildren() == null) {
            continue;
        }
        SimpleNodeIterator childIterator = node.getChildren().elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Children may also be text (whitespace) or other nodes; casting
            // them blindly to OptionTag would throw ClassCastException.
            if (child instanceof OptionTag) {
                metricList.add(((OptionTag) child).getOptionText());
            }
        }
    }
    return metricList;
}
示例6
/**
 * Manual check: downloads a fixed Ganglia page and prints the option texts of
 * its {@code <select id="metrics-picker">} element(s) to standard output.
 *
 * @param args ignored
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
    Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    SimpleNodeIterator iterator = nodeList.elements();
    while (iterator.hasMoreNodes()) {
        Node node = iterator.nextNode();
        // A select without any child has a null children list.
        if (node.getChildren() == null) {
            continue;
        }
        SimpleNodeIterator childIterator = node.getChildren().elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Skip text (whitespace) and other non-option children instead of
            // failing with ClassCastException.
            if (child instanceof OptionTag) {
                System.out.println(((OptionTag) child).getOptionText());
            }
        }
    }
}
示例7
/**
 * Returns the next node, handing out the content of SCRIPT and STYLE elements
 * as a single CDATA node.
 * <p>
 * When a non-empty opening SCRIPT or STYLE tag is read, its content is parsed
 * immediately with {@code parseCDATA(true)} and kept in {@code cached}; the
 * tag itself is returned first and the cached content on the following call.
 * {@code inJS} / {@code inCSS} are true only while the node just returned is
 * such cached script or style content.
 *
 * @return the next node as produced by the superclass, or the cached content node
 * @throws ParserException if the underlying lexer fails
 */
@Override
public Node nextNode() throws ParserException {
    if (cached == null) {
        inJS = false;
        inCSS = false;
        Node next = super.nextNode();
        boolean script = NodeUtils.isNonEmptyOpenTagNodeNamed(next, "SCRIPT");
        if (script || NodeUtils.isNonEmptyOpenTagNodeNamed(next, "STYLE")) {
            // Read the element content ahead; it is delivered on the next call.
            cached = super.parseCDATA(true);
            cachedJS = script;
        }
        return next;
    }

    // Deliver the content that was read ahead and flag what kind it is.
    Node pending = cached;
    cached = null;
    inJS = cachedJS;
    inCSS = !cachedJS;
    return pending;
}
示例8
/**
 * Extracts the text nodes of an HTML fragment.
 * <p>
 * Text nodes are appended in document order; scanning stops as soon as the
 * collected text is longer than {@code len}. The result is not cut to
 * {@code len}, so it can exceed that length by part of the last text node.
 *
 * @param html the HTML source to scan
 * @param len the length after which no further nodes are read
 * @return the concatenated text nodes
 * @throws RuntimeException wrapping a ParserException raised by the lexer
 */
public static String html2Text(String html, int len) {
    try {
        Lexer lexer = new Lexer(html);
        StringBuilder text = new StringBuilder(html.length());
        for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
            if (current instanceof TextNode) {
                text.append(current.toHtml());
            }
            if (text.length() > len) {
                break;
            }
        }
        return text.toString();
    } catch (ParserException e) {
        throw new RuntimeException(e);
    }
}
示例9
/**
 * Passes the {@code src} and {@code href} attribute values of the given nodes
 * to {@code processLink}.
 * <p>
 * Nodes are selected if they carry an href, src, onclick or onblur attribute,
 * but only src and href values are forwarded; onclick/onblur are selected and
 * then not processed. A ParserException is logged as a warning and ends the
 * processing.
 *
 * @param base the URL handed to processLink together with each attribute value
 * @param nodelist the nodes to examine
 */
private void processHtml(HttpUrl base, NodeList nodelist) {
    NodeFilter filter = new HasAttributeFilter("href");
    filter = new OrFilter(filter, new HasAttributeFilter("src"));
    filter = new OrFilter(filter, new HasAttributeFilter("onclick"));
    filter = new OrFilter(filter, new HasAttributeFilter("onblur"));
    try {
        NodeList links = nodelist.extractAllNodesThatMatch(filter);
        for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
            Node node = ni.nextNode();
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String src = tag.getAttribute("src");
            if (src != null) {
                processLink(base, src);
            }
            String href = tag.getAttribute("href");
            if (href != null) {
                processLink(base, href);
            }
        }
    } catch (ParserException pe) {
        _logger.warning("ParserException : " + pe);
    }
}
示例10
/**
 * Reads every node from the lexer and reports it to the observer {@code obs}.
 * <p>
 * The observer is notified of the document start, then of each node by kind
 * (remark; text as style, script or plain text depending on the lexer state;
 * tag as empty, close or open), and finally of the document completion. If a
 * writer is given, each node's HTML is written to it after the node is reported.
 *
 * @param lex the lexer supplying the nodes
 * @param w the writer receiving the HTML of each node, or null to write nothing
 * @throws ParserException if the lexer fails
 * @throws IOException if writing to {@code w} fails
 */
public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException {
    obs.handleDocumentStart();

    for (Node current = lex.nextNode(); current != null; current = lex.nextNode()) {
        if (isRemarkNode(current)) {
            obs.handleRemarkNode((RemarkNode) current);
        } else if (isTextNode(current)) {
            TextNode text = (TextNode) current;
            // The lexer state tells whether this text is style or script content.
            if (lex.inCSS()) {
                obs.handleStyleNode(text);
            } else if (lex.inJS()) {
                obs.handleScriptNode(text);
            } else {
                obs.handleTextNode(text);
            }
        } else {
            TagNode tag = (TagNode) current;
            if (tag.isEmptyXmlTag()) {
                obs.handleTagEmpty(tag);
            } else if (tag.isEndTag()) {
                obs.handleTagClose(tag);
            } else {
                obs.handleTagOpen(tag);
            }
        }

        if (w != null) {
            w.write(current.toHtml(true));
        }
    }

    obs.handleDocumentComplete();
}
示例11
/**
 * Tests whether a node is a tag (opening or closing) with the given name.
 * The name is upper-cased before it is compared with the tag name.
 *
 * @param node the node to test; a non-tag node (including null) yields false
 * @param name the tag name to look for, in any case; must not be null when
 *             {@code node} is a tag
 * @return true if {@code node} is a tag whose name equals the upper-cased {@code name}
 */
public static boolean isTagNodeNamed(Node node, String name) {
    if (!isTagNode(node)) {
        return false;
    }
    String nodeName = ((TagNode) node).getTagName();
    // Locale.ROOT keeps the conversion independent of the default locale (under
    // a Turkish locale "script" would otherwise become "SCRİPT"). Calling equals
    // on the converted name also tolerates a tag without a name.
    return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
}
示例12
/**
 * Tests whether a node is a tag with the given name that is not an end tag.
 * The name is upper-cased before it is compared with the tag name.
 *
 * @param node the node to test; a non-tag node (including null) yields false
 * @param name the tag name to look for, in any case; must not be null when
 *             {@code node} is a non-end tag
 * @return true if {@code node} is a non-end tag whose name equals the upper-cased {@code name}
 */
public static boolean isOpenTagNodeNamed(Node node, String name) {
    if (!isTagNode(node)) {
        return false;
    }
    TagNode tagNode = (TagNode) node;
    if (tagNode.isEndTag()) {
        return false;
    }
    String nodeName = tagNode.getTagName();
    // Locale.ROOT keeps the conversion independent of the default locale (under
    // a Turkish locale "script" would otherwise become "SCRİPT"). Calling equals
    // on the converted name also tolerates a tag without a name.
    return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
}
示例13
/**
 * Tests whether a node is a tag with the given name that is neither an end
 * tag nor an empty XML tag. The name is upper-cased before it is compared
 * with the tag name.
 *
 * @param node the node to test; a non-tag node (including null) yields false
 * @param name the tag name to look for, in any case; must not be null when
 *             {@code node} is a non-empty opening tag
 * @return true if {@code node} is a non-empty opening tag whose name equals
 *         the upper-cased {@code name}
 */
public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) {
    if (!isTagNode(node)) {
        return false;
    }
    TagNode tagNode = (TagNode) node;
    if (tagNode.isEndTag() || tagNode.isEmptyXmlTag()) {
        return false;
    }
    String nodeName = tagNode.getTagName();
    // Locale.ROOT keeps the conversion independent of the default locale (under
    // a Turkish locale "script" would otherwise become "SCRİPT"). Calling equals
    // on the converted name also tolerates a tag without a name.
    return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
}
示例14
/**
 * Tests whether a node is an end tag with the given name.
 * The name is upper-cased before it is compared with the tag name.
 *
 * @param node the node to test; a non-tag node (including null) yields false
 * @param name the tag name to look for, in any case; must not be null when
 *             {@code node} is an end tag
 * @return true if {@code node} is an end tag whose name equals the upper-cased {@code name}
 */
public static boolean isCloseTagNodeNamed(Node node, String name) {
    if (!isTagNode(node)) {
        return false;
    }
    TagNode tagNode = (TagNode) node;
    if (!tagNode.isEndTag()) {
        return false;
    }
    String nodeName = tagNode.getTagName();
    // Locale.ROOT keeps the conversion independent of the default locale (under
    // a Turkish locale "script" would otherwise become "SCRİPT"). Calling equals
    // on the converted name also tolerates a tag without a name.
    return name.toUpperCase(java.util.Locale.ROOT).equals(nodeName);
}
示例15
/**
 * Replaces the site's keywords inside the text nodes of an HTML fragment.
 * <p>
 * The keywords come from {@code getListBySiteId(siteId, true, true)}. In every
 * text node each keyword name is replaced with that keyword's
 * {@code getUrl()} value; all other nodes are copied unchanged. Blank input,
 * or a site without keywords, is returned as is.
 *
 * @param siteId the site whose keywords are applied
 * @param txt the HTML fragment to process
 * @return the fragment with keyword names replaced in its text nodes
 * @throws RuntimeException wrapping a ParserException raised by the lexer
 */
@Transactional(readOnly = true)
public String attachKeyword(Integer siteId, String txt) {
    if (StringUtils.isBlank(txt)) {
        return txt;
    }
    List<CmsKeyword> keywords = getListBySiteId(siteId, true, true);
    int count = keywords.size();
    if (count <= 0) {
        return txt;
    }

    // Parallel arrays as expected by StringUtils.replaceEach.
    String[] names = new String[count];
    String[] urls = new String[count];
    int slot = 0;
    for (CmsKeyword keyword : keywords) {
        names[slot] = keyword.getName();
        urls[slot] = keyword.getUrl();
        slot++;
    }

    try {
        Lexer lexer = new Lexer(txt);
        StringBuilder out = new StringBuilder((int) (txt.length() * 1.2));
        for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
            String html = current.toHtml();
            // Only plain text is rewritten; tags and remarks pass through untouched.
            if (current instanceof TextNode) {
                html = StringUtils.replaceEach(html, names, urls);
            }
            out.append(html);
        }
        return out.toString();
    } catch (ParserException e) {
        throw new RuntimeException(e);
    }
}
示例16
/**
 * Returns the form node held by this object.
 *
 * @return the current value of the {@code form} field
 */
public Node getForm() {
    return this.form;
}
示例17
/**
 * Replaces the form node held by this object.
 *
 * @param form the node to store in the {@code form} field
 */
public void setForm(final Node form) {
    this.form = form;
}
示例18
/**
 * Extracts the links of an HTML page and stores the new ones in the
 * {@code record} table so they can be crawled later.
 * <p>
 * Nothing is stored once the table holds more than {@code Constants.maxCycle}
 * rows. Otherwise every link that starts with {@code Constants.MAINURL}, is
 * not yet in the table, and matches the movie or comment pattern is printed
 * to standard output and inserted with {@code crawled = 0} in one batch.
 * <p>
 * Failures while parsing the page or storing links are printed and swallowed
 * (best effort). Failures while creating the parser or counting the existing
 * rows are propagated. When links are inserted the connection is switched to
 * manual commit and is left in that mode.
 *
 * @param content the page to parse, as accepted by {@code new Parser(String)}
 * @param conn an open connection to the database holding the {@code record} table
 * @throws Exception if the parser cannot be created or the row count query fails
 */
public static void parseFromString(String content, Connection conn) throws Exception {
    Parser parser = new Parser(content);
    HasAttributeFilter filter = new HasAttributeFilter("href");

    // Once the table has grown past maxCycle rows, newly crawled links are
    // no longer inserted.
    if (countRecords(conn) > Constants.maxCycle) {
        return;
    }

    PreparedStatement existsStmt = null;
    PreparedStatement insertStmt = null;
    try {
        NodeList list = parser.parse(filter);

        // Compiled once per page instead of once per link.
        Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
        Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);

        // Bound parameter instead of concatenating the crawled (untrusted) URL
        // into the SQL text.
        existsStmt = conn.prepareStatement("SELECT URL FROM record WHERE URL = ?");

        List<String> nextLinkList = new ArrayList<String>();
        int count = list.size();
        // process every link on this page
        for (int i = 0; i < count; i++) {
            Node node = list.elementAt(i);
            if (!(node instanceof LinkTag)) {
                continue;
            }
            String nextLink = ((LinkTag) node).extractLink();
            if (!nextLink.startsWith(Constants.MAINURL)) {
                continue;
            }
            // skip links that already exist in the database
            if (isRecorded(existsStmt, nextLink)) {
                continue;
            }
            if (moviePattern.matcher(nextLink).find() || commentPattern.matcher(nextLink).find()) {
                nextLinkList.add(nextLink);
            }
        }

        if (nextLinkList.size() > 0) {
            conn.setAutoCommit(false);
            insertStmt = conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?,0)",
                    Statement.RETURN_GENERATED_KEYS);
            for (String nextLinkStr : nextLinkList) {
                insertStmt.setString(1, nextLinkStr);
                insertStmt.addBatch();
                System.out.println(nextLinkStr);
            }
            insertStmt.executeBatch();
            conn.commit();
        }
    } catch (Exception e) {
        // Best effort: a page that cannot be processed must not stop the crawl.
        e.printStackTrace();
        System.out.println("SQLException: " + e.getMessage());
    } finally {
        closeStatementQuietly(insertStmt);
        closeStatementQuietly(existsStmt);
    }
}

/** Returns the number of rows currently in the record table. */
private static int countRecords(Connection conn) throws SQLException {
    Statement stmt = conn.createStatement();
    try {
        ResultSet rs = stmt.executeQuery("select count(*) as rowCount from record");
        try {
            return rs.next() ? rs.getInt("rowCount") : 0;
        } finally {
            closeResultSetQuietly(rs);
        }
    } finally {
        closeStatementQuietly(stmt);
    }
}

/** Tells whether the given URL is already present in the record table. */
private static boolean isRecorded(PreparedStatement existsStmt, String url) throws SQLException {
    existsStmt.setString(1, url);
    ResultSet rs = existsStmt.executeQuery();
    try {
        return rs.next();
    } finally {
        closeResultSetQuietly(rs);
    }
}

/** Closes a statement, ignoring null and any failure raised by close itself. */
private static void closeStatementQuietly(Statement stmt) {
    if (stmt != null) {
        try {
            stmt.close();
        } catch (SQLException ignored) {
            // nothing useful can be done when releasing the statement fails
        }
    }
}

/** Closes a result set, ignoring null and any failure raised by close itself. */
private static void closeResultSetQuietly(ResultSet rs) {
    if (rs != null) {
        try {
            rs.close();
        } catch (SQLException ignored) {
            // nothing useful can be done when releasing the result set fails
        }
    }
}
示例19
/**
 * Tells whether the given node is a {@code TagNode}.
 *
 * @param node the node to test, may be null
 * @return true if {@code node} is an instance of TagNode; false otherwise, including for null
 */
public static boolean isTagNode(Node node) {
    boolean tag = node instanceof TagNode;
    return tag;
}
示例20
/**
 * Tells whether the given node is a {@code TextNode}.
 *
 * @param node the node to test, may be null
 * @return true if {@code node} is an instance of TextNode; false otherwise, including for null
 */
public static boolean isTextNode(Node node) {
    boolean text = node instanceof TextNode;
    return text;
}
示例21
/**
 * Tells whether the given node is a {@code RemarkNode}.
 *
 * @param node the node to test, may be null
 * @return true if {@code node} is an instance of RemarkNode; false otherwise, including for null
 */
public static boolean isRemarkNode(Node node) {
    boolean remark = node instanceof RemarkNode;
    return remark;
}