Java源码示例:edu.stanford.nlp.ling.Word

示例1
@Override
public String[] tokenize(String sentence) {
	// Run the PTB tokenizer over the sentence and collect the token strings.
	PTBTokenizer<Word> tokenizer =
			PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
	List<String> words = new ArrayList<String>();
	while (tokenizer.hasNext()) {
		words.add(tokenizer.next().word());
	}
	// The result array is one slot longer than the token list:
	// index 0 holds the artificial ROOT token expected by the parser.
	String[] result = new String[words.size() + 1];
	result[0] = is2.io.CONLLReader09.ROOT;
	for (int i = 0; i < words.size(); i++) {
		result[i + 1] = words.get(i);
	}
	return result;
}
 
示例2
public StringInText[] tokenizeplus(String sentence) {
	// Tokenize the sentence, keeping each token's character offsets
	// shifted by the running document position (startpos).
	PTBTokenizer<Word> tokenizer =
			PTBTokenizer.newPTBTokenizer(new StringReader(sentence));
	List<StringInText> words = new ArrayList<StringInText>();
	while (tokenizer.hasNext()) {
		Word token = tokenizer.next();
		words.add(new StringInText(token.word(),
				token.beginPosition() + startpos,
				token.endPosition() + startpos));
	}

	// Slot 0 carries the artificial ROOT token with a dummy (0, 0) extent.
	StringInText[] result = new StringInText[words.size() + 1];
	result[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
	int next = 1;
	for (StringInText w : words) {
		result[next++] = w;
	}

	// Advance the running offset past this sentence plus one separator char.
	startpos += 1 + sentence.length();

	return result;
}
 
示例3
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 * 
 * @param sentence
 *            a sentence
 * @return PCFG score
 * @throws RuntimeException if the parser has not been initialized
 */
public static double getPCFGScore(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    // NOTE(review): the parser appears not to be thread-safe, hence the lock.
    synchronized (parser)
    {
        // Parameterized Tokenizer<Word> replaces the raw type; the
        // suppression is narrowed to the single unchecked factory cast
        // instead of covering the whole method.
        @SuppressWarnings("unchecked")
        Tokenizer<Word> tokenizer = (Tokenizer<Word>) tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }

    return score;
}
 
示例4
public List<WordLemmaTag> processSentence(String sentence, boolean isTokenized)
{
	final StanfordLemmatizer lemmatizer = StanfordLemmatizer.getInstance();
	final StanfordPOSTagger tagger = StanfordPOSTagger.getInstance();

	// POS-tag the input; tokenize first unless the caller already did.
	final List<TaggedWord> tagged;
	if (isTokenized)
	{
		tagged = tagger.tag(sentence);
	}
	else
	{
		final List<Word> tokens = StanfordTokenizer.getInstance().tokenize(sentence);
		tagged = tagger.tag(tokens);
	}

	// Lemmatize every tagged word to build the (word, lemma, tag) sentence.
	final List<WordLemmaTag> result = new ArrayList<WordLemmaTag>();
	for (final TaggedWord tw : tagged)
	{
		result.add(lemmatizer.lemmatize(tw));
	}
	return result;
}
 
示例5
public List<Word> tokenize(String string)
{
	// Configure a PTB tokenizer: drop untokenizable characters silently
	// and apply PTB3 escaping.
	this.tokenizer = new PTBTokenizer<Word>(
			new StringReader(string),
			new WordTokenFactory(),
			"untokenizable=noneDelete,ptb3Escaping=true");
	try
	{
		return tokenizer.tokenize();
	}
	catch (Exception e)
	{
		// Best-effort fallback: if PTBTokenizer fails, report the error
		// and split the Penn-tokenizer output on whitespace instead.
		System.err.println(e.getMessage());

		final List<Word> fallback = new ArrayList<Word>();
		for (final String token : pennTokenizer.tokenize(string).split("\\s+"))
		{
			fallback.add(new Word(token));
		}
		return fallback;
	}
}
 
示例6
/**
 * Parses a sentence and returns a string representation of the parse tree.
 * 
 * @param sentence
 *            a sentence
 * @return string form of the best parse tree, with bracketed score
 *         annotations (e.g. " [123.4]") stripped
 * @throws RuntimeException if the parser has not been initialized
 */
public static String parse(String sentence)
{
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    // NOTE(review): the parser appears not to be thread-safe, hence the lock.
    synchronized (parser)
    {
        // Parameterized Tokenizer<Word> replaces the raw type; the
        // suppression is narrowed to the single unchecked factory cast
        // instead of covering the whole method.
        @SuppressWarnings("unchecked")
        Tokenizer<Word> tokenizer = (Tokenizer<Word>) tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // Character-extent labeling pass is currently disabled:
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new
    // MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);

    // Strip bracketed score annotations from the rendered tree.
    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
 
示例7
/**
 * Combines the tokens into a <code>Sentence</code> 
 * 
 * @param tokens the token strings to combine
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
	// Wrap every token string in a Word (a HasWord implementation).
	ArrayList<HasWord> words = new ArrayList<HasWord>();
	for (int i = 0; i < tokens.length; i++) {
		words.add(new Word(tokens[i]));
	}

	// Assemble the sentence from the wrapped words.
	Sentence result = new Sentence();
	result.setWords(words);
	return result;
}
 
示例8
public List<String> tokenizeString(String string)
{
	// Tokenize, then project each Word down to its plain string form.
	final List<Word> words = tokenize(string);
	final List<String> result = new ArrayList<String>(words.size());
	for (final Word w : words)
	{
		result.add(w.word());
	}
	return result;
}
 
示例9
@Override
public List<InputWord> tag(final List<InputWord> input) {
	// Convert the input to Stanford Words, run the tagger, then convert
	// the tagged result back into InputWords (no dependency info: null).
	final List<Word> words = input.stream()
			.map(w -> new Word(w.word))
			.collect(Collectors.toList());
	return tagger.tagSentence(words).stream()
			.map(tw -> new InputWord(tw.word(), tw.tag(), null))
			.collect(Collectors.toList());
}