Implemented an XMI extractor that extracts already analyzed data from the given files and uploads it.

Artorias 2025-03-06 17:47:33 +01:00
parent 7113637f00
commit 37c43e63b8


@@ -0,0 +1,167 @@
package org.texttechnologylab.project.gruppe_05_1.nlp;

import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.result.UpdateResult;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.bson.Document;
import org.bson.conversions.Bson;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
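
/**
 * Extracts pre-analyzed NLP annotations (tokens, sentences, dependencies,
 * named entities and sentiment) from the gzipped XMI files inside the
 * speeches ZIP archive and uploads them as a structured "analysisResults"
 * document to the "speech" collection in MongoDB.
 */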
public class XmiExtractor {

    private MongoDatabase database; // Removed 'static'

    public XmiExtractor(MongoDatabase database) {
        this.database = database;
    }
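
    /**
     * Reads the speeches ZIP archive from the hard-coded path, buffers every
     * *.xmi.gz entry in memory and passes it on to processXmiGzStream.
     *
     * @throws IOException if the archive cannot be opened or read
     */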
    public void extractAndUploadXmiData() throws IOException {
        try (ZipInputStream zis = new ZipInputStream(new FileInputStream("C:/Users/Leon/Desktop/Uni/PPR/multimodal_parliament_explorer_05_1/src/main/resources/speeches/20.zip"))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.getName().endsWith(".xmi.gz")) {
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = zis.read(buffer)) > 0) {
                        baos.write(buffer, 0, len);
                    }
                    byte[] entryData = baos.toByteArray();
                    processXmiGzStream(new ByteArrayInputStream(entryData), entry.getName());
                }
                zis.closeEntry();
            }
        }
    }
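
    /**
     * Decompresses a single *.xmi.gz entry, deserializes it into a JCas and
     * collects tokens, sentences, dependencies, named entities and sentiment
     * annotations into one structured MongoDB document.
     */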
    private void processXmiGzStream(InputStream inputStream, String filename) {
        try (GZIPInputStream gis = new GZIPInputStream(inputStream)) {
            JCas jCas = JCasFactory.createJCas();
            XmiCasDeserializer.deserialize(gis, jCas.getCas(), true);

            // Build structured analysisResults Document
            Document analysisResults = new Document();

            // Tokens: Include POS, Lemma, etc.
            List<Document> tokens = new ArrayList<>();
            for (Token token : JCasUtil.select(jCas, Token.class)) {
                Document tokenDoc = new Document()
                        .append("text", token.getCoveredText())
                        .append("pos", token.getPos().getPosValue())
                        .append("lemma", token.getLemma().getValue());
                tokens.add(tokenDoc);
            }
            analysisResults.append("tokens", tokens);

            // Sentences
            List<String> sentences = JCasUtil.select(jCas, Sentence.class).stream()
                    .map(Sentence::getCoveredText)
                    .collect(Collectors.toList());
            analysisResults.append("sentences", sentences);

            // Dependencies
            List<Document> dependencies = new ArrayList<>();
            for (Dependency dep : JCasUtil.select(jCas, Dependency.class)) {
                Document depDoc = new Document()
                        .append("type", dep.getDependencyType())
                        .append("governor", dep.getGovernor().getCoveredText())
                        .append("dependent", dep.getDependent().getCoveredText());
                dependencies.add(depDoc);
            }
            analysisResults.append("dependencies", dependencies);

            // Named Entities
            List<Document> namedEntities = new ArrayList<>();
            for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
                Document neDoc = new Document()
                        .append("text", ne.getCoveredText())
                        .append("type", ne.getValue());
                namedEntities.add(neDoc);
            }
            analysisResults.append("namedEntities", namedEntities);

            // Sentiment
            List<Document> sentiments = new ArrayList<>();
            for (org.hucompute.textimager.uima.type.Sentiment sentiment :
                    JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) {
                Document sentimentDoc = new Document()
                        .append("begin", sentiment.getBegin())
                        .append("end", sentiment.getEnd())
                        .append("score", sentiment.getSentiment())
                        .append("subjectivity", sentiment.getSubjectivity());
                // Check for VaderSentiment subtype
                if (sentiment instanceof org.hucompute.textimager.uima.type.VaderSentiment) {
                    org.hucompute.textimager.uima.type.VaderSentiment vader =
                            (org.hucompute.textimager.uima.type.VaderSentiment) sentiment;
                    sentimentDoc
                            .append("pos", vader.getPos())
                            .append("neu", vader.getNeu())
                            .append("neg", vader.getNeg());
                }
                sentiments.add(sentimentDoc);
            }
            analysisResults.append("sentiments", sentiments);

            // TODO: implement uploading of topics
            // Upload structured Document to MongoDB
            String speechKey = extractSpeechKeyFromFilename(filename);
            if (speechKey != null) {
                uploadToMongoDB(speechKey, analysisResults);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
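
    /**
     * Derives the speech key from the archive entry name by stripping the
     * "20/" prefix and the ".xmi.gz" suffix.
     */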
    private static String extractSpeechKeyFromFilename(String filename) {
        String baseName = filename.replace(".xmi.gz", "");
        return baseName.replace("20/", "");
    }
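
    /**
     * Stores the structured analysisResults document under the speech with
     * the given speechKey in the "speech" collection.
     */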
    private void uploadToMongoDB(String speechKey, Document analysisResults) {
        MongoCollection<Document> collection = database.getCollection("speech");
        Bson filter = Filters.eq("speechKey", speechKey);
        Bson update = new Document("$set", new Document("analysisResults", analysisResults));
        UpdateResult result = collection.updateOne(filter, update);
        if (result.getModifiedCount() > 0) {
            System.out.println("Structured analysisResults updated for speechKey: " + speechKey);
        } else {
System.out.println("Data has already been uploaded for speechKey: " + speechKey);
        }
    }

    public MongoDatabase getDatabase() {
        return this.database;
    }

    /*public static void main(String[] args) {
        try {
            MongoDBHandler mongoDBHandler = new MongoDBHandler();
            MongoDatabase database = mongoDBHandler.getDatabase(); // Now works!
            XmiExtractor extractor = new XmiExtractor(database);
            extractor.extractAndUploadXmiData();
            System.out.println("Processing complete.");
        } catch (Exception e) {
            e.printStackTrace();
        }
    } */
}