From 37c43e63b8349bcc81556fc4cac2bb8eb66e20a0 Mon Sep 17 00:00:00 2001
From: Artorias
Date: Thu, 6 Mar 2025 17:47:33 +0100
Subject: [PATCH] Implemented an XMI extractor for extracting and uploading
 already analyzed data from the given files.

---
 .../project/gruppe_05_1/nlp/XmiExtractor.java | 167 ++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java

diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java
new file mode 100644
index 0000000..3e56e40
--- /dev/null
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java
@@ -0,0 +1,167 @@
+package org.texttechnologylab.project.gruppe_05_1.nlp;
+
+import com.mongodb.client.MongoCollection;
+import com.mongodb.client.MongoDatabase;
+import com.mongodb.client.model.Filters;
+import com.mongodb.client.result.UpdateResult;
+import org.apache.uima.fit.util.JCasUtil;
+import org.bson.Document;
+import java.io.*;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.zip.*;
+import org.bson.conversions.Bson;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
+import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
+
+/**
+ * Extracts already analyzed XMI data from a ZIP archive of gzipped XMI files
+ * and uploads the structured results to MongoDB.
+ */
+public class XmiExtractor {
+    private final MongoDatabase database;
+
+    public XmiExtractor(MongoDatabase database) {
+        this.database = database;
+    }
+
+    public void extractAndUploadXmiData() throws IOException {
+        try (ZipInputStream zis = new ZipInputStream(new FileInputStream("C:/Users/Leon/Desktop/Uni/PPR/multimodal_parliament_explorer_05_1/src/main/resources/speeches/20.zip"))) {
+            ZipEntry entry;
+            while ((entry = zis.getNextEntry()) != null) {
+                if (entry.getName().endsWith(".xmi.gz")) {
+                    // Buffer the entry so the gzip stream can be read without consuming the ZIP stream.
+                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                    byte[] buffer = new byte[1024];
+                    int len;
+                    while ((len = zis.read(buffer)) > 0) {
+                        baos.write(buffer, 0, len);
+                    }
+                    byte[] entryData = baos.toByteArray();
+                    processXmiGzStream(new ByteArrayInputStream(entryData), entry.getName());
+                }
+                zis.closeEntry();
+            }
+        }
+    }
+
+    private void processXmiGzStream(InputStream inputStream, String filename) {
+        try (GZIPInputStream gis = new GZIPInputStream(inputStream)) {
+            JCas jCas = JCasFactory.createJCas();
+            XmiCasDeserializer.deserialize(gis, jCas.getCas(), true);
+
+            // Build structured analysisResults Document
+            Document analysisResults = new Document();
+
+            // Tokens: include POS, lemma, etc.
+            List<Document> tokens = new ArrayList<>();
+            for (Token token : JCasUtil.select(jCas, Token.class)) {
+                Document tokenDoc = new Document()
+                        .append("text", token.getCoveredText())
+                        // Guard against tokens without POS/lemma annotations.
+                        .append("pos", token.getPos() != null ? token.getPos().getPosValue() : null)
+                        .append("lemma", token.getLemma() != null ? token.getLemma().getValue() : null);
+                tokens.add(tokenDoc);
+            }
+            analysisResults.append("tokens", tokens);
+
+            // Sentences
+            List<String> sentences = JCasUtil.select(jCas, Sentence.class).stream()
+                    .map(Sentence::getCoveredText)
+                    .collect(Collectors.toList());
+            analysisResults.append("sentences", sentences);
+
+            // Dependencies
+            List<Document> dependencies = new ArrayList<>();
+            for (Dependency dep : JCasUtil.select(jCas, Dependency.class)) {
+                Document depDoc = new Document()
+                        .append("type", dep.getDependencyType())
+                        .append("governor", dep.getGovernor().getCoveredText())
+                        .append("dependent", dep.getDependent().getCoveredText());
+                dependencies.add(depDoc);
+            }
+            analysisResults.append("dependencies", dependencies);
+
+            // Named Entities
+            List<Document> namedEntities = new ArrayList<>();
+            for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
+                Document neDoc = new Document()
+                        .append("text", ne.getCoveredText())
+                        .append("type", ne.getValue());
+                namedEntities.add(neDoc);
+            }
+            analysisResults.append("namedEntities", namedEntities);
+
+            // Sentiment
+            List<Document> sentiments = new ArrayList<>();
+            for (org.hucompute.textimager.uima.type.Sentiment sentiment :
+                    JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) {
+
+                Document sentimentDoc = new Document()
+                        .append("begin", sentiment.getBegin())
+                        .append("end", sentiment.getEnd())
+                        .append("score", sentiment.getSentiment())
+                        .append("subjectivity", sentiment.getSubjectivity());
+
+                // Check for VaderSentiment subtype
+                if (sentiment instanceof org.hucompute.textimager.uima.type.VaderSentiment) {
+                    org.hucompute.textimager.uima.type.VaderSentiment vader =
+                            (org.hucompute.textimager.uima.type.VaderSentiment) sentiment;
+                    sentimentDoc
+                            .append("pos", vader.getPos())
+                            .append("neu", vader.getNeu())
+                            .append("neg", vader.getNeg());
+                }
+                sentiments.add(sentimentDoc);
+            }
+            analysisResults.append("sentiments", sentiments);
+
+            // TODO: implement uploading of topics
+
+            // Upload structured Document to MongoDB
+            String speechKey = extractSpeechKeyFromFilename(filename);
+            if (speechKey != null) {
+                uploadToMongoDB(speechKey, analysisResults);
+            }
+
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    private static String extractSpeechKeyFromFilename(String filename) {
+        String baseName = filename.replace(".xmi.gz", "");
+        return baseName.replace("20/", "");
+    }
+
+    private void uploadToMongoDB(String speechKey, Document analysisResults) {
+        MongoCollection<Document> collection = database.getCollection("speech");
+        Bson filter = Filters.eq("speechKey", speechKey);
+        Bson update = new Document("$set", new Document("analysisResults", analysisResults));
+
+        UpdateResult result = collection.updateOne(filter, update);
+        if (result.getModifiedCount() > 0) {
+            System.out.println("Structured analysisResults updated for speechKey: " + speechKey);
+        } else {
+            // modifiedCount == 0 means no matching document or identical existing data.
+            System.out.println("No update for speechKey (missing or unchanged): " + speechKey);
+        }
+    }
+
+    public MongoDatabase getDatabase() {
+        return this.database;
+    }
+
+    /*public static void main(String[] args) {
+        try {
+            MongoDBHandler mongoDBHandler = new MongoDBHandler();
+            MongoDatabase database = mongoDBHandler.getDatabase();
+            XmiExtractor extractor = new XmiExtractor(database);
+            extractor.extractAndUploadXmiData();
+            System.out.println("Processing complete.");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    } */
+}
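
Usage: a minimal sketch of wiring up the extractor, mirroring the commented-out main above. It assumes the project's MongoDBHandler (referenced in that main) exposes getDatabase(); the read-back query at the end is illustrative only and not part of this patch.

import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import org.bson.Document;

public class XmiExtractorDemo {
    public static void main(String[] args) throws Exception {
        // Assumption: MongoDBHandler is the project's connection helper,
        // as used in the commented-out main in XmiExtractor.
        MongoDatabase database = new MongoDBHandler().getDatabase();

        // Run the extractor against the configured ZIP of gzipped XMI files.
        new XmiExtractor(database).extractAndUploadXmiData();

        // Read back one updated speech to inspect the stored analysisResults.
        MongoCollection<Document> speeches = database.getCollection("speech");
        Document speech = speeches.find(Filters.exists("analysisResults")).first();
        if (speech != null) {
            Document results = speech.get("analysisResults", Document.class);
            System.out.println("tokens stored: " + results.getList("tokens", Document.class).size());
            System.out.println("sentences stored: " + results.getList("sentences", String.class).size());
        }
    }
}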