code needs 21718261765125gb ram :D

2025-03-06 13:05:48 +01:00 · 2025-03-06 13:05:48 +01:00 · e2244a4b45
commit e2244a4b45
parent 4af6279324
8 changed files with 199 additions and 17 deletions
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
@ -5,6 +5,7 @@ import com.mongodb.client.MongoDatabase;
 import org.texttechnologylab.project.gruppe_05_1.database.*;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
+import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
 import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
 import org.texttechnologylab.project.gruppe_05_1.util.Logger;
 import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
@ -56,12 +57,13 @@ public class Main {

        //TEST

-        Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
+
        SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
        if (MongoPprUtils.getSpeechCollection().countDocuments() != 0) {
            System.out.println("Speeches werden nicht gelesen, da sie bereits in der Datenbank stehen");
        }
        else {
+            Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
            SpeechIndex speechIndex = speechIndexFactory
                    .parseLegislativePeriods(TRUE)
                    .builder()
@ -91,10 +93,6 @@ public class Main {
            mongoDBHandler.close(); // Close the connection to the DB
        }

-
-        //TEST
-
-
        // Stellt fest, dass alle nötigen Datenbank-Collections existieren
        PPRUtils.ensureCollectionExist();

@ -102,6 +100,9 @@ public class Main {
        PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);

        // NLP-Verarbeitung - TODO
+        NlpUtils.importXmiData();
+
+
        RESTHandler restHandler = new RESTHandler();
        restHandler.startJavalin();

--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
@ -7,9 +7,7 @@ import com.mongodb.client.MongoClient;
 import com.mongodb.client.MongoClients;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoDatabase;
-import com.mongodb.client.model.Filters;
-import com.mongodb.client.model.Indexes;
-import com.mongodb.client.model.Updates;
+import com.mongodb.client.model.*;
 import exceptions.AgendaItemNotFoundException;
 import exceptions.MemberNotFoundException;
 import exceptions.ServerErrorException;
@ -537,7 +535,8 @@ public class MongoDBHandler {
                    .append("speechId", speech.getSpeechId())
                    .append("speakerId", speech.getSpeakerId())
                    .append("speakerName", speech.getSpeakerName())
-                    .append("fraction", speech.getFraction());
+                    .append("fraction", speech.getFraction())
+                    .append("speechKey", speech.getSpeechKey());

            // Convert speechContents to a list of Documents
            List<Document> contentDocuments = new ArrayList<>();
@ -638,6 +637,12 @@ public class MongoDBHandler {
        return result;
    }

+    /**
+     * Writes the given XMI text into the {@code xmiData} field of a speech document.
+     * <p>
+     * The target is selected by equality on the {@code speechKey} field; {@code updateOne}
+     * changes at most one matching document and does nothing when no document matches.
+     *
+     * @param speechKey  value compared against the {@code speechKey} field
+     * @param xmiContent text stored under {@code xmiData}
+     */
+    public void updateXmiData(String speechKey, String xmiContent) {
+        Document keyFilter = new Document("speechKey", speechKey);
+        Document setXmi = new Document("$set", new Document("xmiData", xmiContent));
+        speechesCollection.updateOne(keyFilter, setXmi);
+    }

    public void deleteAllDocuments() {
        speechesCollection.deleteMany(new Document());
@ -646,6 +651,11 @@ public class MongoDBHandler {
        //historyCollection.deleteMany(new Document());
    }

+    /**
+     * Sends the given write operations to the speeches collection as one unordered bulk write.
+     * <p>
+     * Unordered means the server may execute the operations in any order and continues with the
+     * remaining ones when a single operation fails.
+     * <p>
+     * NOTE(review): {@code collectionName} is not evaluated; the operations always go to
+     * {@code speechesCollection}. Callers currently pass {@code "Rede"} - confirm whether a
+     * lookup by name is intended.
+     *
+     * @param collectionName currently unused (see note above)
+     * @param updates        write models to execute; {@code null} or an empty list is a no-op
+     */
+    public void bulkUpdateDocuments(String collectionName, List<WriteModel<Document>> updates) {
+        // The driver rejects an empty request list with an exception, so skip the round trip.
+        if (updates == null || updates.isEmpty()) {
+            return;
+        }
+        BulkWriteOptions options = new BulkWriteOptions().ordered(false);
+        speechesCollection.bulkWrite(updates, options);
+    }
+
    /**
     * Closes the underlying {@code mongoClient}.
     */
    public void close() {
        mongoClient.close();
    }
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java
@ -15,7 +15,10 @@ public class Speech_MongoDB_Impl  extends Speech_File_Impl implements Speech {
                mongoDocument.getInteger("speechId"),
                mongoDocument.getInteger("speakerId"),
                mongoDocument.getString("speakerName"),
-                mongoDocument.getString("fraction"));
+                mongoDocument.getString("fraction"),
+                mongoDocument.getString("speechKey")
+        );
+

        for (Document content : (List<Document>) mongoDocument.get("speechContents")) {
            switch (content.getString("type")) {
@ -32,5 +35,6 @@ public class Speech_MongoDB_Impl  extends Speech_File_Impl implements Speech {
                    throw new IllegalArgumentException("Unknown content type: " + content.getString("type"));
            }
        }
+
    }
 }
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
@ -1,15 +1,29 @@
 package org.texttechnologylab.project.gruppe_05_1.nlp;

+import com.mongodb.client.model.Filters;
+import com.mongodb.client.model.UpdateOneModel;
+import com.mongodb.client.model.Updates;
+import com.mongodb.client.model.WriteModel;
 import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
 import org.apache.commons.io.FileUtils;
 import org.apache.uima.UIMAException;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.fit.factory.JCasFactory;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.util.CasCreationUtils;
+import org.apache.uima.util.XMLInputSource;
+import org.bson.Document;
 import org.dkpro.core.io.xmi.XmiWriter;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIDockerDriver;
@ -17,25 +31,31 @@ import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
 import org.texttechnologylab.annotation.NamedEntity;
+import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
 import org.texttechnologylab.uima.type.Sentiment;
 import org.xml.sax.SAXException;

-import java.io.File;
-import java.io.IOException;
+import java.io.*;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Base64;
-import java.util.Collection;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;

 import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
+import static org.springframework.core.io.buffer.DataBufferUtils.readInputStream;

 public class NlpUtils {

    // common class-attributes
    private static DUUIComposer pComposer = null;
    private static int iWorkers = 1;
+    private static final String TYPE_SYSTEM_DESCRIPTOR_PATH = "/speeches/TypeSystem.xml.gz";
+    private static final int MAX_FEATURE_LENGTH = 10000;


    public static void createNlpData() {
@ -310,4 +330,137 @@ public class NlpUtils {
    /**
     * Placeholder for sentiment extraction; the body is empty, so calling it has no effect.
     */
    private static void createSentimentInfo() {

    }
+
+    /** Number of queued updates after which they are sent to MongoDB and released from memory. */
+    private static final int XMI_IMPORT_BATCH_SIZE = 50;
+
+    /**
+     * Imports the XMI files bundled in the classpath resource {@code /speeches/20.zip} into MongoDB.
+     * <p>
+     * Every non-directory entry whose name ends in {@code .gz} is gunzipped, deserialized into a
+     * JCas and flattened via {@link #processJCas(JCas)}. The annotation list and the raw XMI text
+     * are written to the fields {@code annotations} and {@code xmi} of the speech document whose
+     * {@code speechKey} equals the entry name without its {@code .gz} suffix.
+     * <p>
+     * Updates are flushed in batches of {@link #XMI_IMPORT_BATCH_SIZE} so that the XMI text of the
+     * whole archive is never held in memory at once. An entry that cannot be decompressed or
+     * deserialized is reported on stderr and skipped; the import continues with the next entry.
+     * <p>
+     * NOTE(review): the key is the complete entry name minus {@code .gz}. If the archive entries
+     * carry a directory prefix or an additional extension such as {@code .xmi}, the key will not
+     * equal the id stored by the speech parser - verify against the archive layout.
+     * NOTE(review): the success message counts queued updates, not matched documents; an update
+     * whose filter matches nothing is silently a no-op on the server.
+     *
+     * @throws IOException if the ZIP resource is missing or cannot be read
+     */
+    public static void importXmiData() throws IOException {
+        InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
+        if (zipStream == null) {
+            throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
+        }
+        try (ZipInputStream zis = new ZipInputStream(zipStream)) {
+            // Opened only after the archive was found and closed on every exit path.
+            MongoDBHandler mongoDBHandler = new MongoDBHandler();
+            try {
+                List<WriteModel<Document>> bulkOperations = new ArrayList<>();
+                int totalUpdates = 0;
+                ZipEntry entry;
+                while ((entry = zis.getNextEntry()) != null) {
+                    String entryName = entry.getName();
+                    if (entry.isDirectory() || !entryName.endsWith(".gz")) {
+                        continue; // getNextEntry() closes the skipped entry itself
+                    }
+                    String speechKey = entryName.substring(0, entryName.length() - 3);
+                    // A read error of the archive itself is fatal and propagates as IOException.
+                    byte[] gzData = readRemainingBytes(zis);
+                    try {
+                        bulkOperations.add(createXmiUpdate(speechKey, gzData));
+                        totalUpdates++;
+                    } catch (Exception e) {
+                        // Best effort: one broken XMI file must not abort the whole import.
+                        System.err.println("XMI-Import übersprungen für " + entryName);
+                        e.printStackTrace();
+                    }
+                    if (bulkOperations.size() >= XMI_IMPORT_BATCH_SIZE) {
+                        mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
+                        bulkOperations.clear();
+                    }
+                }
+                if (!bulkOperations.isEmpty()) {
+                    mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
+                }
+                if (totalUpdates > 0) {
+                    System.out.println("Bulk update erfolgreich für " + totalUpdates + " Dokumente.");
+                }
+            } finally {
+                mongoDBHandler.close();
+            }
+        }
+    }
+
+    /** Reads the stream up to its end and returns the bytes; the stream is left open. */
+    private static byte[] readRemainingBytes(InputStream in) throws IOException {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        byte[] buffer = new byte[8192];
+        int len;
+        while ((len = in.read(buffer)) != -1) {
+            out.write(buffer, 0, len);
+        }
+        return out.toByteArray();
+    }
+
+    /** Builds the MongoDB update for one gzip-compressed XMI document. */
+    private static UpdateOneModel<Document> createXmiUpdate(String speechKey, byte[] gzData) throws Exception {
+        byte[] xmiBytes;
+        try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
+            xmiBytes = readRemainingBytes(gzis);
+        }
+        JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
+        Document updateFields = new Document()
+                .append("annotations", processJCas(jcas))
+                .append("xmi", new String(xmiBytes, StandardCharsets.UTF_8));
+        // Speech documents are inserted with the id under "speechKey"; no visible code writes "RedeID".
+        return new UpdateOneModel<>(Filters.eq("speechKey", speechKey), new Document("$set", updateFields));
+    }
+
+
+    /**
+     * Decodes the stream as UTF-8 and returns its text line by line.
+     * <p>
+     * Each line read is followed by a single {@code "\n"} in the result, whatever terminator
+     * the input used, so a non-empty result always ends with a newline. The stream is not closed.
+     *
+     * @param is stream to read until its end
+     * @return the concatenated lines, each terminated by {@code "\n"}
+     * @throws IOException if reading from the stream fails
+     */
+    private static String readInputStream(InputStream is) throws IOException {
+        StringBuilder text = new StringBuilder();
+        BufferedReader lineReader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+        for (String current = lineReader.readLine(); current != null; current = lineReader.readLine()) {
+            text.append(current);
+            text.append("\n");
+        }
+        return text.toString();
+    }
+
+    /** Parsed type system, shared by all conversions and loaded on first use. */
+    private static TypeSystemDescription cachedTypeSystemDescription = null;
+
+    /**
+     * Deserializes XMI content into a new JCas.
+     * <p>
+     * The CAS is created from the gzip-compressed type system descriptor at
+     * {@link #TYPE_SYSTEM_DESCRIPTOR_PATH}. The descriptor is decompressed and parsed only once
+     * and then reused, because this method is called for every XMI file of an import.
+     *
+     * @param xmiInputStream stream delivering the XMI document; it is not closed here
+     * @return a JCas holding the deserialized document
+     * @throws Exception if the descriptor is missing or unreadable, or deserialization fails
+     */
+    private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
+        CAS cas = CasCreationUtils.createCas(loadTypeSystemDescription(), null, null);
+        // lenient = true: types in the XMI that the type system does not know are ignored.
+        XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
+        return cas.getJCas();
+    }
+
+    /** Returns the parsed type system descriptor, reading the classpath resource on first call. */
+    private static synchronized TypeSystemDescription loadTypeSystemDescription() throws Exception {
+        if (cachedTypeSystemDescription == null) {
+            InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
+            if (tsCompressedStream == null) {
+                throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
+            }
+            // Both streams are closed even if the gzip header cannot be read.
+            try (InputStream compressed = tsCompressedStream;
+                 GZIPInputStream tsStream = new GZIPInputStream(compressed)) {
+                XMLInputSource inputSource = new XMLInputSource(tsStream);
+                cachedTypeSystemDescription = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
+            }
+        }
+        return cachedTypeSystemDescription;
+    }
+
+    /**
+     * Flattens every annotation of the CAS annotation index into a plain map.
+     * <p>
+     * Each result map has two entries: {@code "type"} holds the annotation's type name and
+     * {@code "features"} holds a map from feature short name to the feature value as string.
+     * Values longer than {@link #MAX_FEATURE_LENGTH} characters are cut to that length;
+     * {@code null} values are kept.
+     *
+     * @param jcas the JCas whose annotations are read
+     * @return one map per annotation, in the iteration order of the annotation index
+     */
+    private static List<Map<String, Object>> processJCas(JCas jcas) {
+        List<Map<String, Object>> annotationsData = new ArrayList<>();
+        CAS cas = jcas.getCas();
+        AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
+        for (AnnotationFS annotation : index) {
+            Map<String, String> featuresMap = new HashMap<>();
+            for (Feature feature : annotation.getType().getFeatures()) {
+                // NOTE(review): this is applied to every feature, including ones whose range is
+                // another feature structure (e.g. the sofa reference). Depending on the UIMA
+                // version that may throw or yield a long toString dump - presumably the reason
+                // for the length cap. Consider restricting to feature.getRange().isPrimitive().
+                String featureValue = annotation.getFeatureValueAsString(feature);
+                if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
+                    featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH);
+                }
+                featuresMap.put(feature.getShortName(), featureValue);
+            }
+            Map<String, Object> annotationData = new HashMap<>();
+            annotationData.put("type", annotation.getType().getName());
+            annotationData.put("features", featuresMap);
+            annotationsData.add(annotationData);
+        }
+        return annotationsData;
+    }
+
 }
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java
@ -17,6 +17,7 @@ public class Speech_File_Impl implements Speech {
    private final String speakerName;
    private final String fraction;
    private final List<Content> speechContents;
+    private final String speechKey;

    @Override
    public int getSessionId() {
@ -53,13 +54,19 @@ public class Speech_File_Impl implements Speech {
        return speechContents;
    }

-    public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction) {
+    /**
+     * Returns the speech key that was passed to the constructor.
+     *
+     * @return the value of the {@code speechKey} field
+     */
+    @Override
+    public String getSpeechKey() {
+        return this.speechKey;
+    }
+
+    /**
+     * Creates a speech from its identifiers and speaker data; the content list starts empty.
+     *
+     * @param sessionId    id of the session the speech belongs to
+     * @param agendaItemId id of the agenda item the speech belongs to
+     * @param speechId     numeric id of the speech
+     * @param speakerId    id of the speaker
+     * @param speakerName  display name of the speaker
+     * @param fraction     fraction of the speaker
+     * @param speechKey    string key of the speech, returned by {@code getSpeechKey()}
+     */
+    public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction, String speechKey) {
        this.speakerId = speakerId;
        this.agendaItemId = agendaItemId;
        this.speechId = speechId;
        this.speakerName = speakerName;
        this.fraction = fraction;
        this.sessionId = sessionId;
+        this.speechKey = speechKey;
        this.speechContents = new ArrayList<>();
    }

--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java
@ -54,6 +54,13 @@ public interface Speech {
     */
    int getSessionId();

+    /**
+     * Returns the speech key.
+     *
+     * @return The string key identifying this speech.
+     */
+    String getSpeechKey();
+
    /**
     * Returns the speech contents.
     *
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java
@ -112,14 +112,14 @@ public class SpeechParser {
                int speakerId = Integer.parseInt(speakerElement.getAttribute("id"));
                Element nameElement = (Element) speakerElement.getElementsByTagName("name").item(0);
                if (nameElement == null) continue;
-
+                String redeID = speechElement.getAttribute("id");
                String title = getOptionalTextContent(nameElement, "titel");
                String firstName = getOptionalTextContent(nameElement, "vorname");
                String lastName = getOptionalTextContent(nameElement, "nachname");
                String fraction = getOptionalTextContent(nameElement, "fraktion");

                String speakerName = (title != null ? title + " " : "") + firstName + " " + lastName;
-                Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction);
+                Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction, redeID);

                // Add the speaker to speech contents
                speech.addContent(new Speaker_File_Impl(0, speechId, speakerId, speakerName, fraction));
--- a/src/main/resources/speeches/TypeSystem.xml.gz
+++ b/src/main/resources/speeches/TypeSystem.xml.gz