diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java index 02ccfac..21d0b64 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java @@ -6,6 +6,7 @@ import org.texttechnologylab.project.gruppe_05_1.database.*; import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb; import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument; import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils; +import org.texttechnologylab.project.gruppe_05_1.nlp.XmiExtractor; import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler; import org.texttechnologylab.project.gruppe_05_1.util.Logger; import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils; @@ -99,8 +100,10 @@ public class Main { // Alle Informationen (Parlamentarier, Reden, Kommentare etc.) lesen und in die Mongo-DB einfügen, falls diese noch nicht vorhanden sind. PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory); + // Upload der analysierten XMI dateien + // NLP-Verarbeitung - TODO - NlpUtils.importXmiData(); + // NlpUtils.importXmiData(); RESTHandler restHandler = new RESTHandler(); diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java index d892fc5..f23152c 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java @@ -94,6 +94,10 @@ public class MongoDBHandler { } + public MongoDatabase getDatabase() { + return this.database; + } + /** * Get the MongoDB according to properties. * If a local server URI is defined, use it. Otherwise, use remote server. 
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java index 27f1dea..f498b9a 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java @@ -331,136 +331,4 @@ public class NlpUtils { } - /** - * Liest die ZIP-Datei aus dem Ressourcenordner (/speeches/20.zip), - * iteriert über alle .gz-Dateien, dekomprimiert sie, wandelt den XMI-Inhalt in einen JCas um, - * verarbeitet den JCas (iteriert über alle Annotationen) und extrahiert z. B. NLP-Daten (z. B. POS-Tags). - * Anschließend wird das jeweilige Dokument in MongoDB aktualisiert. - * - * @throws IOException falls ein Fehler beim Dateizugriff auftritt - */ - public static void importXmiData() throws IOException { - MongoDBHandler mongoDBHandler = new MongoDBHandler(); - List<UpdateOneModel<Document>> bulkOperations = new ArrayList<>(); - InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip"); - if (zipStream == null) { - throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches"); - } - try (ZipInputStream zis = new ZipInputStream(zipStream)) { - ZipEntry entry; - while ((entry = zis.getNextEntry()) != null) { - if (!entry.isDirectory() && entry.getName().endsWith(".gz")) { - String speechKey = entry.getName().substring(0, entry.getName().length() - 3); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[4096]; - int len; - while ((len = zis.read(buffer)) != -1) { - baos.write(buffer, 0, len); - } - byte[] gzData = baos.toByteArray(); - try { - - ByteArrayOutputStream decompressedBaos = new ByteArrayOutputStream(); - try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) { - byte[] buf = new byte[4096]; - int bytesRead; - while ((bytesRead = gzis.read(buf)) != -1) { - decompressedBaos.write(buf, 0, bytesRead); - } - } - byte[] 
xmiBytes = decompressedBaos.toByteArray(); - String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8); - JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes)); - List<Map<String, Object>> annotations = processJCas(jcas); - Document filter = new Document("RedeID", speechKey); - Document updateFields = new Document() - .append("annotations", annotations) - .append("xmi", xmiContent); - Document update = new Document("$set", updateFields); - UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update); - bulkOperations.add(updateModel); - } catch (Exception e) { - e.printStackTrace(); - } - } - zis.closeEntry(); - } - } - if (!bulkOperations.isEmpty()) { - mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations); - System.out.println("Bulk update erfolgreich für " + bulkOperations.size() + " Dokumente."); - } - mongoDBHandler.close(); - } - - - /** - * Liest einen InputStream vollständig in einen String ein. - * - * @param is InputStream - * @return Den gesamten String-Inhalt - * @throws IOException falls ein Fehler beim Lesen auftritt - */ - private static String readInputStream(InputStream is) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); - StringBuilder sb = new StringBuilder(); - String line; - while ((line = reader.readLine()) != null) { - sb.append(line).append("\n"); - } - return sb.toString(); - } - - /** - * Wandelt den XMI-Inhalt aus dem InputStream in einen JCas um. - * Dabei wird der hardcodierte, komprimierte TypeSystem-Descriptor zuerst dekomprimiert. 
- * - * @param xmiInputStream InputStream des XMI-Inhalts - * @return Den konvertierten JCas - * @throws Exception falls beim Laden oder Deserialisieren ein Fehler auftritt - */ - private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception { - InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH); - if (tsCompressedStream == null) { - throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH); - } - try (GZIPInputStream tsStream = new GZIPInputStream(tsCompressedStream)) { - XMLInputSource inputSource = new XMLInputSource(tsStream); - TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource); - CAS cas = CasCreationUtils.createCas(tsd, null, null); - XmiCasDeserializer.deserialize(xmiInputStream, cas, true); - return cas.getJCas(); - } - } - - /** - * Iteriert über alle Annotationen im JCas und verarbeitet diese. - * Hier kannst du deine eigene Logik einfügen, um die JCas-Daten weiter zu verarbeiten. 
- * - * @param jcas Der konvertierte JCas - */ - private static List<Map<String, Object>> processJCas(JCas jcas) { - List<Map<String, Object>> annotationsData = new ArrayList<>(); - CAS cas = jcas.getCas(); - AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex(); - for (AnnotationFS annotation : index) { - Map<String, Object> annotationData = new HashMap<>(); - annotationData.put("type", annotation.getType().getName()); - Map<String, String> featuresMap = new HashMap<>(); - for (Feature feature : annotation.getType().getFeatures()) { - String featureName = feature.getShortName(); - String featureValue = annotation.getFeatureValueAsString(feature); - if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) { - featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH); // trim for efficiency - } - featuresMap.put(featureName, featureValue); - featuresMap.put(featureName, featureValue); - } - annotationData.put("features", featuresMap); - - annotationsData.add(annotationData); - } - return annotationsData; - } - }