我不熟悉使用Java和Apache Spark进行文本挖掘。我正在尝试对文本数据进行LDA。首先,我使用IDF模型对词项进行加权。然后,我创建一个LDA模型来获取我的主题。结果,我得到一个带有termIndices和termWeights的表。
如何从我的lda模型中获取主题作为单词?
这是我使用的代码:
import java.util.Arrays;
import java.util.List;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.ml.clustering.LDA;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
public class Main {

    /**
     * Demonstrates a small LDA topic-modeling pipeline on an in-memory corpus:
     * tokenize sentences, compute term frequencies with HashingTF, rescale
     * with IDF, then fit an LDA model and show its topics.
     */
    public static void main(String[] args) {
        Logger.getLogger("org.apache").setLevel(Level.WARN);

        SparkSession sparkSession = SparkSession.builder()
                .appName("testing")
                .master("local[*]")
                .getOrCreate();

        List<Row> data = Arrays.asList(
                RowFactory.create(0.0, "Hi I heard about Spark"),
                RowFactory.create(0.0, "I wish Java could use case classes"),
                RowFactory.create(1.0, "Logistic regression models are neat")
        );

        // StructField requires (name, dataType, nullable, metadata); the
        // original "sentence" field omitted the nullable flag and did not compile.
        StructType schema = new StructType(new StructField[]{
                new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
        });
        Dataset<Row> sentenceData = sparkSession.createDataFrame(data, schema);

        // Split each sentence into lowercase whitespace-delimited tokens.
        Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
        Dataset<Row> wordsData = tokenizer.transform(sentenceData);

        // NOTE(review): HashingTF maps words to indices via a hash function, so
        // the termIndices produced by describeTopics() cannot be translated back
        // into words. To recover topic words, replace HashingTF with a
        // CountVectorizer and look indices up in its vocabulary() array.
        int numFeatures = 20;
        HashingTF hashingTF = new HashingTF()
                .setInputCol("words")
                .setOutputCol("rawFeatures")
                .setNumFeatures(numFeatures);
        Dataset<Row> featurizedData = hashingTF.transform(wordsData);

        // The Spark class is IDFModel — "Idfmodel" does not exist and failed to compile.
        IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
        IDFModel idfModel = idf.fit(featurizedData);
        Dataset<Row> rescaledData = idfModel.transform(featurizedData);
        rescaledData.select("label", "features").show();

        // The correct setter is setFeaturesCol (was "setfeaturesCol").
        LDA lda = new LDA()
                .setK(2)
                .setMaxIter(10)
                .setOptimizer("em")
                .setFeaturesCol("features");
        LDAModel ldaModel = lda.fit(rescaledData);

        // Topics are reported as term indices with per-term weights.
        ldaModel.describeTopics().show();

        Dataset<Row> transformed = ldaModel.transform(rescaledData);
        transformed.show();

        sparkSession.close();
    }
}
这是我的代码的输出:
+-----+--------------------+
|label| features|
+-----+--------------------+
| 0.0|(20,[0,5,9,17],[0...|
| 0.0|(20,[2,7,13,15]...|
| 1.0|(20,[4,6,15,18...|
+-----+--------------------+
+-----+--------------------+--------------------+
|topic| termIndices| termWeights|
+-----+--------------------+--------------------+
| 0|[17,2,...|[0.16715273617466...|
| 1|[17,18,...|[0.15751266315244...|
+-----+--------------------+--------------------+
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label| sentence| words| rawFeatures| features| topicDistribution|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
| 0.0|Hi I heard about ...|[hi,i,heard,ab...|(20,[1...|(20,[0...|[0.50052275837267...|
| 0.0|I wish Java could...|[i,wish,java,c...|(20,15]...|(20,15]...|[0.49871227849509...|
| 1.0|Logistic regressi...|[logistic,regres...|(20,18...|(20,18...|[0.50063942630916...|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+