Java源码示例:org.htmlparser.Node

示例1
public HTMLForm(Node form, String url,  String commonJS) {
	this.form = form;

	this.formName = ((FormTag) form).getFormName();
	if (this.formName == null)
		this.formName = "form_"+Integer.toString(formCounter);
	this.url = url; 
	this.z3FormFormulas = new HashSet<Formula>();

	this.commonJS = commonJS;
	this.jsValidation = new String("");
	this.domRepresentation = new String();
	this.windowRepresentation = new String();
	this.helperFuns = new String();

}
 
示例2
/**
 * Returns the number of columns in the given row. Every child matching
 * {@code HTML_ROW_FILTER} counts as many columns as its {@code colspan}
 * attribute states, or as one column when {@code MathUtils.parseInt} yields
 * {@code null} for that attribute. A row without children has zero columns.
 */
private static int getColumnCount( TableRow row )
{
    // htmlparser reports "no children" as a null list rather than an empty one.
    if ( row.getChildren() == null )
    {
        return 0;
    }

    Node[] cells = row.getChildren().extractAllNodesThatMatch( HTML_ROW_FILTER ).toNodeArray();

    int cols = 0;

    for ( Node cell : cells )
    {
        Integer colSpan = MathUtils.parseInt( ((TagNode) cell).getAttribute( "colspan" ) );

        cols += colSpan != null ? colSpan : 1;
    }

    return cols;
}
 
示例3
/**
 * Retrieves the value of a table cell by concatenating the text of its
 * child nodes. For composite tags such as span or div the inner text is
 * appended; for any other node its own text is appended. The result is
 * trimmed and every {@code &nbsp;} entity is replaced with {@code EMPTY}.
 * A cell without children produces the same result as a cell with no text.
 */
public static String getValue( TagNode cell )
{
    StringBuilder builder = new StringBuilder();

    // htmlparser reports "no children" as a null list rather than an empty one.
    if ( cell.getChildren() != null )
    {
        for ( Node child : cell.getChildren().toNodeArray() )
        {
            if ( child instanceof CompositeTag )
            {
                builder.append( ((CompositeTag) child).getStringText() );
            }
            else
            {
                builder.append( child.getText() );
            }
        }
    }

    return builder.toString().trim().replaceAll( "&nbsp;", EMPTY );
}
 
示例4
/**
 * Parses the body of an HTML message and returns its parsed representation.
 * See <a href="http://htmlparser.sourceforge.net/">htmlparser</a> for details.
 *
 * @param url the url that the message resulted from
 * @param message the Message to parse
 * @return a NodeList containing the nodes making up the page, or null when
 *         the Content-Type header is missing or does not start with
 *         text/html, when the content is null or empty, or when parsing fails
 */
public Object parseMessage(HttpUrl url, Message message) {
    String type = message.getHeader("Content-Type");
    boolean isHtml = type != null && type.matches("text/html.*");
    if (!isHtml) {
        return null;
    }

    byte[] body = message.getContent();
    boolean hasBody = body != null && body.length != 0;
    if (!hasBody) {
        return null;
    }

    // NOTE(review): the bytes are decoded with the platform default charset,
    // not the charset announced in Content-Type - confirm this is intended.
    Parser parser = Parser.createParser(new String(body), null);
    try {
        // The filter accepts every node, so the whole page is returned.
        NodeFilter acceptAll = new NodeFilter() {
            public boolean accept(Node node) {
                return true;
            }
        };
        return parser.extractAllNodesThatMatch(acceptAll);
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}
 
示例5
/**
 * Downloads the Ganglia page of a cluster and lists the metrics it offers.
 * <p>
 * The page URL is {@code gangliaMetricUrl} with every match of
 * {@code clusterPattern} replaced by {@code clusterName}. The metrics are the
 * option texts of every {@code <select id="metrics-picker">} element. Children
 * of the select that are not option tags (for example whitespace text between
 * options) are skipped.
 *
 * @param clusterName the cluster name substituted into the URL
 * @return the option texts in document order; empty when no option is found
 * @throws ParserException       if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException           if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	List<String> metricList = new ArrayList<String>();

	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		// htmlparser reports "no children" as a null list rather than an empty one.
		NodeList options = iterator.nextNode().getChildren();
		if (options == null) {
			continue;
		}

		SimpleNodeIterator childIterator = options.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// A blind cast would fail on text or remark nodes inside the select.
			if (child instanceof OptionTag) {
				metricList.add(((OptionTag) child).getOptionText());
			}
		}
	}

	return metricList;
}
 
示例6
/**
 * Prints the option texts of every {@code <select id="metrics-picker">}
 * element of a Ganglia page, one per line.
 *
 * @param args optional; when present, {@code args[0]} is the page URL to
 *             read instead of the built-in default
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
	String pageUrl = args != null && args.length > 0
			? args[0]
			: "http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false";
	Parser parser = new Parser(new URL(pageUrl).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);

	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		// htmlparser reports "no children" as a null list rather than an empty one.
		NodeList options = iterator.nextNode().getChildren();
		if (options == null) {
			continue;
		}

		SimpleNodeIterator childIterator = options.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// A blind cast would fail on text or remark nodes inside the select.
			if (child instanceof OptionTag) {
				System.out.println(((OptionTag) child).getOptionText());
			}
		}
	}
}
 
示例7
/**
 * Returns the next node and tracks whether that node is script or style
 * content.
 * <p>
 * When the underlying lexer yields a non-empty opening SCRIPT or STYLE tag,
 * the CDATA that follows is parsed immediately and held back; the tag itself
 * is returned first. The following call hands out the held-back node and sets
 * {@code inJS} (after SCRIPT) or {@code inCSS} (after STYLE). On every other
 * call both flags are false.
 */
@Override
public Node nextNode() throws ParserException {
	if (cached != null) {
		// Hand out the CDATA parsed during the previous call.
		Node pending = cached;
		cached = null;
		inJS = cachedJS;
		inCSS = !cachedJS;
		return pending;
	}

	inJS = false;
	inCSS = false;
	Node current = super.nextNode();
	boolean opensScript = NodeUtils.isNonEmptyOpenTagNodeNamed(current, "SCRIPT");
	if (opensScript || NodeUtils.isNonEmptyOpenTagNodeNamed(current, "STYLE")) {
		cached = super.parseCDATA(true);
		cachedJS = opensScript;
	}
	return current;
}
 
示例8
/**
 * Extracts the text nodes of an HTML fragment.
 * <p>
 * Nodes are read in order and the HTML of each text node is appended to the
 * result. Reading stops as soon as the result is longer than {@code len};
 * the node that crossed the limit is kept whole, so the result may be longer
 * than {@code len}.
 *
 * @param html the HTML to read
 * @param len  the length after which no further nodes are read
 * @return the concatenated text nodes
 * @throws RuntimeException wrapping the ParserException raised by the lexer
 */
public static String html2Text(String html, int len) {
	try {
		Lexer lexer = new Lexer(html);
		StringBuilder text = new StringBuilder(html.length());
		for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
			if (current instanceof TextNode) {
				text.append(current.toHtml());
			}
			if (text.length() > len) {
				break;
			}
		}
		return text.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
示例9
/**
 * Hands the {@code src} and {@code href} attribute values of the given nodes
 * to {@code processLink}.
 * <p>
 * Nodes are selected when they carry an href, src, onclick or onblur
 * attribute; only src and href values are processed, src first. A
 * ParserException is logged as a warning and ends the processing.
 *
 * @param base     the base url passed on to {@code processLink}
 * @param nodelist the nodes to search
 */
private void processHtml(HttpUrl base, NodeList nodelist) {
    NodeFilter filter = new HasAttributeFilter("href");
    filter = new OrFilter(filter, new HasAttributeFilter("src"));
    filter = new OrFilter(filter, new HasAttributeFilter("onclick"));
    filter = new OrFilter(filter, new HasAttributeFilter("onblur"));
    try {
        NodeList links = nodelist.extractAllNodesThatMatch(filter);
        for (NodeIterator ni = links.elements(); ni.hasMoreNodes(); ) {
            Node node = ni.nextNode();
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            String src = tag.getAttribute("src");
            if (src != null) {
                processLink(base, src);
            }
            String href = tag.getAttribute("href");
            if (href != null) {
                processLink(base, href);
            }
        }
    } catch (ParserException pe) {
        _logger.warning("ParserException : " + pe);
    }
}
 
示例10
/**
 * Reads every node from the lexer and reports it to the observer.
 * <p>
 * The observer is told about the document start, then about each node by
 * kind (remark; style, script or plain text; empty, closing or opening tag)
 * and finally about the document end. When a writer is given, the HTML of
 * every node is also written to it after the node has been reported.
 *
 * @param lex the lexer supplying the nodes
 * @param w   receives the HTML of each node; may be null to write nothing
 * @throws ParserException if the lexer fails
 * @throws IOException     if writing fails
 */
public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException {
	obs.handleDocumentStart();
	for (Node node = lex.nextNode(); node != null; node = lex.nextNode()) {
		if (isRemarkNode(node)) {
			obs.handleRemarkNode((RemarkNode) node);
		} else if (isTextNode(node)) {
			TextNode text = (TextNode) node;
			if (lex.inCSS()) {
				obs.handleStyleNode(text);
			} else if (lex.inJS()) {
				obs.handleScriptNode(text);
			} else {
				obs.handleTextNode(text);
			}
		} else {
			// Anything that is neither remark nor text is treated as a tag.
			TagNode tag = (TagNode) node;
			if (tag.isEmptyXmlTag()) {
				obs.handleTagEmpty(tag);
			} else if (tag.isEndTag()) {
				obs.handleTagClose(tag);
			} else {
				obs.handleTagOpen(tag);
			}
		}
		if (w != null) {
			w.write(node.toHtml(true));
		}
	}
	obs.handleDocumentComplete();
}
 
示例11
/**
 * Tests whether the node is a tag (opening or closing) with the given name.
 *
 * @param node the node to test
 * @param name the tag name; it is upper-cased before the comparison
 * @return true when the node is a TagNode whose tag name equals the
 *         upper-cased name; false otherwise, including when the tag has no name
 */
public static boolean isTagNodeNamed(Node node, String name) {
	if(!isTagNode(node)) {
		return false;
	}
	String nodeName = ((TagNode) node).getTagName();
	// Upper-case with a fixed locale: under e.g. a Turkish default locale
	// "script".toUpperCase() does not produce "SCRIPT".
	return nodeName != null
			&& nodeName.equals(name.toUpperCase(java.util.Locale.ENGLISH));
}
 
示例12
/**
 * Tests whether the node is a tag that is not an end tag and has the given name.
 *
 * @param node the node to test
 * @param name the tag name; it is upper-cased before the comparison
 * @return true when the node is a TagNode, is not an end tag and its tag
 *         name equals the upper-cased name; false otherwise, including when
 *         the tag has no name
 */
public static boolean isOpenTagNodeNamed(Node node, String name) {
	if(!isTagNode(node)) {
		return false;
	}
	TagNode tagNode = (TagNode) node;
	if(tagNode.isEndTag()) {
		return false;
	}
	String nodeName = tagNode.getTagName();
	// Upper-case with a fixed locale: under e.g. a Turkish default locale
	// "script".toUpperCase() does not produce "SCRIPT".
	return nodeName != null
			&& nodeName.equals(name.toUpperCase(java.util.Locale.ENGLISH));
}
 
示例13
/**
 * Tests whether the node is a tag with the given name that is neither an end
 * tag nor an empty XML tag.
 *
 * @param node the node to test
 * @param name the tag name; it is upper-cased before the comparison
 * @return true when the node is a TagNode, is not an end tag, is not an
 *         empty XML tag and its tag name equals the upper-cased name; false
 *         otherwise, including when the tag has no name
 */
public static boolean isNonEmptyOpenTagNodeNamed(Node node, String name) {
	if(!isTagNode(node)) {
		return false;
	}
	TagNode tagNode = (TagNode) node;
	if(tagNode.isEndTag() || tagNode.isEmptyXmlTag()) {
		return false;
	}
	String nodeName = tagNode.getTagName();
	// Upper-case with a fixed locale: under e.g. a Turkish default locale
	// "script".toUpperCase() does not produce "SCRIPT".
	return nodeName != null
			&& nodeName.equals(name.toUpperCase(java.util.Locale.ENGLISH));
}
 
示例14
/**
 * Tests whether the node is an end tag with the given name.
 *
 * @param node the node to test
 * @param name the tag name; it is upper-cased before the comparison
 * @return true when the node is a TagNode, is an end tag and its tag name
 *         equals the upper-cased name; false otherwise, including when the
 *         tag has no name
 */
public static boolean isCloseTagNodeNamed(Node node, String name) {
	if(!isTagNode(node)) {
		return false;
	}
	TagNode tagNode = (TagNode) node;
	if(!tagNode.isEndTag()) {
		return false;
	}
	String nodeName = tagNode.getTagName();
	// Upper-case with a fixed locale: under e.g. a Turkish default locale
	// "script".toUpperCase() does not produce "SCRIPT".
	return nodeName != null
			&& nodeName.equals(name.toUpperCase(java.util.Locale.ENGLISH));
}
 
示例15
/**
 * Replaces keyword names inside the text nodes of an HTML fragment.
 * <p>
 * The keywords are loaded with {@code getListBySiteId(siteId, true, true)}.
 * In every text node each keyword name is replaced by that keyword's
 * {@code getUrl()} value; all other nodes are copied unchanged. Blank input
 * and an empty keyword list return the input as is.
 *
 * @param siteId the site whose keywords are applied
 * @param txt    the HTML fragment to process
 * @return the fragment with keyword names replaced inside text nodes
 * @throws RuntimeException wrapping the ParserException raised by the lexer
 */
@Transactional(readOnly = true)
public String attachKeyword(Integer siteId, String txt) {
	if (StringUtils.isBlank(txt)) {
		return txt;
	}
	List<CmsKeyword> keywords = getListBySiteId(siteId, true, true);
	int total = keywords.size();
	if (total <= 0) {
		return txt;
	}

	// Parallel arrays: names[n] is replaced by links[n].
	String[] names = new String[total];
	String[] links = new String[total];
	int slot = 0;
	for (CmsKeyword keyword : keywords) {
		names[slot] = keyword.getName();
		links[slot] = keyword.getUrl();
		slot++;
	}

	try {
		Lexer lexer = new Lexer(txt);
		StringBuilder out = new StringBuilder((int) (txt.length() * 1.2));
		for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
			if (current instanceof TextNode) {
				out.append(StringUtils.replaceEach(current.toHtml(), names, links));
			} else {
				out.append(current.toHtml());
			}
		}
		return out.toString();
	} catch (ParserException e) {
		throw new RuntimeException(e);
	}
}
 
示例16
/**
 * Returns the form node held by this object.
 *
 * @return the current form node
 */
public Node getForm() {
	return this.form;
}
 
示例17
/**
 * Replaces the form node held by this object.
 *
 * @param form the new form node
 */
public void setForm(Node form)
{
	this.form = form;
}
 
示例18
/**
 * Scans an HTML page for links and queues the new, relevant ones in the
 * {@code record} table.
 * <p>
 * Nothing is queued once the table holds more than {@code Constants.maxCycle}
 * rows. A link is queued when it starts with {@code Constants.MAINURL},
 * matches {@code Constants.MOVIE_REGULAR_EXP} or
 * {@code Constants.COMMENT_REGULAR_EXP}, is not already stored and has not
 * already been queued from this page. Queued links are inserted as one batch
 * with {@code crawled = 0}, printed to standard output and committed.
 * <p>
 * Failures while parsing, looking up or inserting links are printed and
 * swallowed (best effort). Failures while creating the parser or counting the
 * rows propagate to the caller.
 * <p>
 * NOTE(review): when links are inserted, the connection is left with
 * auto-commit disabled and a failed batch is not rolled back - confirm that
 * callers expect this before changing it.
 *
 * @param content the page to scan, handed to the htmlparser {@code Parser}
 * @param conn    an open JDBC connection; it is not closed here
 * @throws Exception if the parser cannot be created or the row count fails
 */
public static void parseFromString(String content, Connection conn) throws Exception {

    Parser parser = new Parser(content);
    HasAttributeFilter filter = new HasAttributeFilter("href");

    int rowCount = 0;
    Statement countStmt = null;
    ResultSet countRs = null;
    try {
        countStmt = conn.createStatement();
        countRs = countStmt.executeQuery("select count(*) as rowCount from record");
        if (countRs.next()) {
            rowCount = countRs.getInt("rowCount");
        }
    } finally {
        closeResultSetQuietly(countRs);
        closeStatementQuietly(countStmt);
    }

    // Once rowCount is bigger than maxCycle, newly crawled links are no longer inserted.
    if (rowCount > Constants.maxCycle) {
        return;
    }

    PreparedStatement existsStmt = null;
    PreparedStatement insertStmt = null;
    try {
        // Compiled once per page instead of once per link.
        Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
        Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);
        String mainUrl = Constants.MAINURL;

        // Bound parameter instead of string concatenation: links come from
        // crawled pages and must never become part of the SQL text.
        existsStmt = conn.prepareStatement("SELECT URL FROM record WHERE URL = ?");

        List<String> nextLinkList = new ArrayList<String>();
        NodeList list = parser.parse(filter);
        int count = list.size();

        // Process every link on this page.
        for (int i = 0; i < count; i++) {
            Node node = list.elementAt(i);
            if (!(node instanceof LinkTag)) {
                continue;
            }
            String nextLink = ((LinkTag) node).extractLink();
            if (!nextLink.startsWith(mainUrl)) {
                continue;
            }
            // Cheap in-memory checks first, database lookup last.
            if (!moviePattern.matcher(nextLink).find()
                    && !commentPattern.matcher(nextLink).find()) {
                continue;
            }
            if (nextLinkList.contains(nextLink)) {
                // The same link occurs more than once on the page.
                continue;
            }

            // Check whether the link already exists in the database.
            existsStmt.setString(1, nextLink);
            ResultSet existsRs = existsStmt.executeQuery();
            try {
                if (!existsRs.next()) {
                    nextLinkList.add(nextLink);
                }
            } finally {
                closeResultSetQuietly(existsRs);
            }
        }

        if (!nextLinkList.isEmpty()) {
            conn.setAutoCommit(false);
            // Insert the links that do not exist in the database yet.
            insertStmt = conn.prepareStatement(
                    "INSERT INTO record (URL, crawled) VALUES (?,0)",
                    Statement.RETURN_GENERATED_KEYS);
            for (String nextLinkStr : nextLinkList) {
                insertStmt.setString(1, nextLinkStr);
                insertStmt.addBatch();
                System.out.println(nextLinkStr);
            }
            insertStmt.executeBatch();
            conn.commit();
        }
    } catch (Exception e) {
        // Best effort: a failing page must not stop the crawl.
        e.printStackTrace();
        System.out.println("SQLException: " + e.getMessage());
    } finally {
        closeStatementQuietly(insertStmt);
        closeStatementQuietly(existsStmt);
    }
}

/** Closes the statement if it is not null; a failure to close is ignored. */
private static void closeStatementQuietly(Statement statement) {
    if (statement == null) {
        return;
    }
    try {
        statement.close();
    } catch (SQLException ignored) {
        // Nothing useful can be done when releasing a statement fails.
    }
}

/** Closes the result set if it is not null; a failure to close is ignored. */
private static void closeResultSetQuietly(ResultSet resultSet) {
    if (resultSet == null) {
        return;
    }
    try {
        resultSet.close();
    } catch (SQLException ignored) {
        // Nothing useful can be done when releasing a result set fails.
    }
}
 
示例19
/**
 * Tests whether the node is a {@code TagNode}.
 *
 * @param node the node to test; null yields false
 * @return true when the node is an instance of TagNode
 */
public static boolean isTagNode(Node node)
{
	return node instanceof TagNode;
}
 
示例20
/**
 * Tests whether the node is a {@code TextNode}.
 *
 * @param node the node to test; null yields false
 * @return true when the node is an instance of TextNode
 */
public static boolean isTextNode(Node node)
{
	return node instanceof TextNode;
}
 
示例21
/**
 * Tests whether the node is a {@code RemarkNode}.
 *
 * @param node the node to test; null yields false
 * @return true when the node is an instance of RemarkNode
 */
public static boolean isRemarkNode(Node node)
{
	return node instanceof RemarkNode;
}