I am following the Apache Spark example code from the documentation: https://spark.apache.org/docs/latest/ml-features.html#countvectorizer
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.*;
public class CountVectorizer_Demo {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("LDA Online").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);

    // Input data: each row is a bag of words from a sentence or document.
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
      RowFactory.create(Arrays.asList("a", "b", "c")),
      RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
    ));
    StructType schema = new StructType(new StructField[] {
      new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
    });
    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    // Fit a CountVectorizerModel from the corpus.
    CountVectorizerModel cvModel = new CountVectorizer()
      .setInputCol("text")
      .setOutputCol("feature")
      .setVocabSize(3)
      .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
      .fit(df);

    // Alternatively, define a CountVectorizerModel with an a-priori vocabulary.
    CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
      .setInputCol("text")
      .setOutputCol("feature");

    cvModel.transform(df).show();
  }
}
But I get the following error message:
15/10/22 23:04:20 INFO BlockManagerMaster: Registering block manager localhost:56882 with 703.6 MB RAM, BlockManagerId(, localhost, 56882)
15/10/22 23:04:20 INFO BlockManagerMaster: Registered BlockManager
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/InternalRow
    at org.apache.spark.ml.feature.CountVectorizerParams$class.validateAndTransformSchema(CountVectorizer.scala:72)
    at org.apache.spark.ml.feature.CountVectorizer.validateAndTransformSchema(CountVectorizer.scala:107)
    at org.apache.spark.ml.feature.CountVectorizer.transformSchema(CountVectorizer.scala:168)
    at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:62)
    at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:130)
    at CountVectorizer_Demo.main(CountVectorizer_Demo.java:39)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.InternalRow
    at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
    ... 6 more
Thanks in advance.
Thank you all. I solved my problem by adding this dependency:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-catalyst_2.10</artifactId>
    <version>1.5.1</version>
</dependency>