Implemented an XMI extractor for extracting and uploading already analyzed data from the given files.

2025-03-06 17:40:55 +01:00 · 2025-03-06 17:40:55 +01:00 · 7113637f00
commit 7113637f00
parent e2244a4b45
3 changed files with 8 additions and 133 deletions
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
@ -6,6 +6,7 @@ import org.texttechnologylab.project.gruppe_05_1.database.*;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
 import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
+import org.texttechnologylab.project.gruppe_05_1.nlp.XmiExtractor;
 import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
 import org.texttechnologylab.project.gruppe_05_1.util.Logger;
 import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
@ -99,8 +100,10 @@ public class Main {
        // Alle Informationen (Parlamentarier, Reden, Kommentare etc.) lesen und in die Mongo-DB einfügen, falls diese noch nicht vorhanden sind.
        PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);

+        // Upload der analysierten XMI dateien
+
        // NLP-Verarbeitung - TODO
-        NlpUtils.importXmiData();
+        // NlpUtils.importXmiData();


        RESTHandler restHandler = new RESTHandler();
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
@ -94,6 +94,10 @@ public class MongoDBHandler {

    }

+    public MongoDatabase getDatabase() {
+        return this.database;
+    }
+
    /**
     * Get the MongoDB according to properties.
     * If a local server URI is defined, use it. Otherwise, use remote server.
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
@ -331,136 +331,4 @@ public class NlpUtils {

    }

-    /**
-     * Liest die ZIP-Datei aus dem Ressourcenordner (/speeches/20.zip),
-     * iteriert über alle .gz-Dateien, dekomprimiert sie, wandelt den XMI-Inhalt in einen JCas um,
-     * verarbeitet den JCas (iteriert über alle Annotationen) und extrahiert z. B. NLP-Daten (z. B. POS-Tags).
-     * Anschließend wird das jeweilige Dokument in MongoDB aktualisiert.
-     *
-     * @throws IOException falls ein Fehler beim Dateizugriff auftritt
-     */
-    public static void importXmiData() throws IOException {
-        MongoDBHandler mongoDBHandler = new MongoDBHandler();
-        List<WriteModel<Document>> bulkOperations = new ArrayList<>();
-        InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
-        if (zipStream == null) {
-            throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
-        }
-        try (ZipInputStream zis = new ZipInputStream(zipStream)) {
-            ZipEntry entry;
-            while ((entry = zis.getNextEntry()) != null) {
-                if (!entry.isDirectory() && entry.getName().endsWith(".gz")) {
-                    String speechKey = entry.getName().substring(0, entry.getName().length() - 3);
-                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-                    byte[] buffer = new byte[4096];
-                    int len;
-                    while ((len = zis.read(buffer)) != -1) {
-                        baos.write(buffer, 0, len);
-                    }
-                    byte[] gzData = baos.toByteArray();
-                    try {
-
-                        ByteArrayOutputStream decompressedBaos = new ByteArrayOutputStream();
-                        try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
-                            byte[] buf = new byte[4096];
-                            int bytesRead;
-                            while ((bytesRead = gzis.read(buf)) != -1) {
-                                decompressedBaos.write(buf, 0, bytesRead);
-                            }
-                        }
-                        byte[] xmiBytes = decompressedBaos.toByteArray();
-                        String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8);
-                        JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
-                        List<Map<String, Object>> annotations = processJCas(jcas);
-                        Document filter = new Document("RedeID", speechKey);
-                        Document updateFields = new Document()
-                                .append("annotations", annotations)
-                                .append("xmi", xmiContent);
-                        Document update = new Document("$set", updateFields);
-                        UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update);
-                        bulkOperations.add(updateModel);
-                    } catch (Exception e) {
-                        e.printStackTrace();
-                    }
-                }
-                zis.closeEntry();
-            }
-        }
-        if (!bulkOperations.isEmpty()) {
-            mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
-            System.out.println("Bulk update erfolgreich für " + bulkOperations.size() + " Dokumente.");
-        }
-        mongoDBHandler.close();
-    }
-
-
-    /**
-     * Liest einen InputStream vollständig in einen String ein.
-     *
-     * @param is InputStream
-     * @return Den gesamten String-Inhalt
-     * @throws IOException falls ein Fehler beim Lesen auftritt
-     */
-    private static String readInputStream(InputStream is) throws IOException {
-        BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
-        StringBuilder sb = new StringBuilder();
-        String line;
-        while ((line = reader.readLine()) != null) {
-            sb.append(line).append("\n");
-        }
-        return sb.toString();
-    }
-
-    /**
-     * Wandelt den XMI-Inhalt aus dem InputStream in einen JCas um.
-     * Dabei wird der hardcodierte, komprimierte TypeSystem-Descriptor zuerst dekomprimiert.
-     *
-     * @param xmiInputStream InputStream des XMI-Inhalts
-     * @return Den konvertierten JCas
-     * @throws Exception falls beim Laden oder Deserialisieren ein Fehler auftritt
-     */
-    private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
-        InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
-        if (tsCompressedStream == null) {
-            throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
-        }
-        try (GZIPInputStream tsStream = new GZIPInputStream(tsCompressedStream)) {
-            XMLInputSource inputSource = new XMLInputSource(tsStream);
-            TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
-            CAS cas = CasCreationUtils.createCas(tsd, null, null);
-            XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
-            return cas.getJCas();
-        }
-    }
-
-    /**
-     * Iteriert über alle Annotationen im JCas und verarbeitet diese.
-     * Hier kannst du deine eigene Logik einfügen, um die JCas-Daten weiter zu verarbeiten.
-     *
-     * @param jcas Der konvertierte JCas
-     */
-    private static List<Map<String, Object>> processJCas(JCas jcas) {
-        List<Map<String, Object>> annotationsData = new ArrayList<>();
-        CAS cas = jcas.getCas();
-        AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
-        for (AnnotationFS annotation : index) {
-            Map<String, Object> annotationData = new HashMap<>();
-            annotationData.put("type", annotation.getType().getName());
-            Map<String, String> featuresMap = new HashMap<>();
-            for (Feature feature : annotation.getType().getFeatures()) {
-                String featureName = feature.getShortName();
-                String featureValue = annotation.getFeatureValueAsString(feature);
-                if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
-                    featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH); // trim for efficiency
-                }
-                featuresMap.put(featureName, featureValue);
-                featuresMap.put(featureName, featureValue);
-            }
-            annotationData.put("features", featuresMap);
-
-            annotationsData.add(annotationData);
-        }
-        return annotationsData;
-    }
-
 }