diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
index 6a14729..02ccfac 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
@@ -5,6 +5,7 @@ import com.mongodb.client.MongoDatabase;
 import org.texttechnologylab.project.gruppe_05_1.database.*;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
+import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
 import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
 import org.texttechnologylab.project.gruppe_05_1.util.Logger;
 import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
@@ -56,12 +57,13 @@ public class Main {
 
         //TEST
 
-        Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
+
         SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
 
         if (MongoPprUtils.getSpeechCollection().countDocuments() != 0) {
             System.out.println("Speeches werden nicht gelesen, da sie bereits in der Datenbank stehen");
         } else {
+            Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
             SpeechIndex speechIndex = speechIndexFactory
                     .parseLegislativePeriods(TRUE)
                     .builder()
@@ -91,10 +93,6 @@ public class Main {
             mongoDBHandler.close(); // Close the connection to the DB
         }
 
-
-        //TEST
-
-
         // Stellt fest, dass alle nötigen Datenbank-Collections existieren
         PPRUtils.ensureCollectionExist();
 
@@ -102,6 +100,9 @@ public class Main {
         PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);
 
         // NLP-Verarbeitung - TODO
+        NlpUtils.importXmiData();
+
+
         RESTHandler restHandler = new RESTHandler();
         restHandler.startJavalin();
 
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
index 8361a70..d892fc5 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
@@ -7,9 +7,7 @@ import com.mongodb.client.MongoClient;
 import com.mongodb.client.MongoClients;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoDatabase;
-import com.mongodb.client.model.Filters;
-import com.mongodb.client.model.Indexes;
-import com.mongodb.client.model.Updates;
+import com.mongodb.client.model.*;
 import exceptions.AgendaItemNotFoundException;
 import exceptions.MemberNotFoundException;
 import exceptions.ServerErrorException;
@@ -537,7 +535,8 @@ public class MongoDBHandler {
                 .append("speechId", speech.getSpeechId())
                 .append("speakerId", speech.getSpeakerId())
                 .append("speakerName", speech.getSpeakerName())
-                .append("fraction", speech.getFraction());
+                .append("fraction", speech.getFraction())
+                .append("speechKey", speech.getSpeechKey());
 
         // Convert speechContents to a list of Documents
         List<Document> contentDocuments = new ArrayList<>();
@@ -638,6 +637,19 @@ public class MongoDBHandler {
         return result;
     }
 
+    /**
+     * Sets the {@code xmiData} field of one speech document whose
+     * {@code speechKey} field equals the given key.
+     *
+     * @param speechKey  value matched against the {@code speechKey} field
+     * @param xmiContent XMI text written to the {@code xmiData} field
+     */
+    public void updateXmiData(String speechKey, String xmiContent) {
+        speechesCollection.updateOne(
+                Filters.eq("speechKey", speechKey),
+                Updates.set("xmiData", xmiContent)
+        );
+    }
 
     public void deleteAllDocuments() {
         speechesCollection.deleteMany(new Document());
@@ -646,6 +658,19 @@ public class MongoDBHandler {
         //historyCollection.deleteMany(new Document());
     }
 
+    /**
+     * Executes the given write operations as one unordered bulk write.
+     * NOTE: {@code collectionName} is currently ignored; the operations are
+     * always applied to the speeches collection.
+     *
+     * @param collectionName name of the target collection (not used yet)
+     * @param updates        write models to execute
+     */
+    public void bulkUpdateDocuments(String collectionName, List<WriteModel<Document>> updates) {
+        BulkWriteOptions options = new BulkWriteOptions().ordered(false);
+        speechesCollection.bulkWrite(updates, options);
+    }
+
     public void close() {
         mongoClient.close();
     }
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java
index 0ac537c..5f2fbb3 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/domainimp/speeches/Speech_MongoDB_Impl.java
@@ -15,7 +15,10 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
                 mongoDocument.getInteger("speechId"),
                 mongoDocument.getInteger("speakerId"),
                 mongoDocument.getString("speakerName"),
-                mongoDocument.getString("fraction"));
+                mongoDocument.getString("fraction"),
+                mongoDocument.getString("speechKey")
+        );
+
 
         for (Document content : (List<Document>) mongoDocument.get("speechContents")) {
             switch (content.getString("type")) {
@@ -32,5 +35,6 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
                     throw new IllegalArgumentException("Unknown content type: " + content.getString("type"));
             }
         }
+
     }
 }
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
index 7b80788..27f1dea 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java
@@ -1,15 +1,29 @@
 package org.texttechnologylab.project.gruppe_05_1.nlp;
 
+import com.mongodb.client.model.Filters;
+import com.mongodb.client.model.UpdateOneModel;
+import com.mongodb.client.model.Updates;
+import com.mongodb.client.model.WriteModel;
 import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
 import org.apache.commons.io.FileUtils;
 import org.apache.uima.UIMAException;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.fit.factory.JCasFactory;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.util.CasCreationUtils;
+import org.apache.uima.util.XMLInputSource;
+import org.bson.Document;
 import org.dkpro.core.io.xmi.XmiWriter;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIDockerDriver;
@@ -17,25 +31,31 @@ import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
 import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
 import org.texttechnologylab.annotation.NamedEntity;
+import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
 import org.texttechnologylab.uima.type.Sentiment;
 import org.xml.sax.SAXException;
 
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Base64;
-import java.util.Collection;
+import java.util.*;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
 
 import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
+import static org.springframework.core.io.buffer.DataBufferUtils.readInputStream;
 
 public class NlpUtils {
 
     // common class-attributes
     private static DUUIComposer pComposer = null;
     private static int iWorkers = 1;
+    private static final String TYPE_SYSTEM_DESCRIPTOR_PATH = "/speeches/TypeSystem.xml.gz";
+    private static final int MAX_FEATURE_LENGTH = 10000;
 
     public static void createNlpData() {
 
@@ -310,4 +330,143 @@ public class NlpUtils {
     private static void createSentimentInfo() {
 
     }
+
+    /**
+     * Reads the ZIP archive /speeches/20.zip from the classpath resources, decompresses
+     * every .gz entry, deserializes the XMI content into a JCas and collects the type and
+     * feature values of all annotations. The collected data and the raw XMI are then
+     * written to the matching speech documents in MongoDB with a single bulk update.
+     *
+     * @throws IOException if the archive is missing or cannot be read
+     */
+    public static void importXmiData() throws IOException {
+        MongoDBHandler mongoDBHandler = new MongoDBHandler();
+        // try/finally so the handler is closed even if reading the archive fails
+        try {
+            List<WriteModel<Document>> bulkOperations = new ArrayList<>();
+            InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
+            if (zipStream == null) {
+                throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
+            }
+            try (ZipInputStream zis = new ZipInputStream(zipStream)) {
+                ZipEntry entry;
+                while ((entry = zis.getNextEntry()) != null) {
+                    if (!entry.isDirectory() && entry.getName().endsWith(".gz")) {
+                        // entry name without the ".gz" suffix is used as the speech key
+                        String speechKey = entry.getName().substring(0, entry.getName().length() - 3);
+                        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+                        byte[] buffer = new byte[4096];
+                        int len;
+                        while ((len = zis.read(buffer)) != -1) {
+                            baos.write(buffer, 0, len);
+                        }
+                        byte[] gzData = baos.toByteArray();
+                        try {
+                            ByteArrayOutputStream decompressedBaos = new ByteArrayOutputStream();
+                            try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
+                                byte[] buf = new byte[4096];
+                                int bytesRead;
+                                while ((bytesRead = gzis.read(buf)) != -1) {
+                                    decompressedBaos.write(buf, 0, bytesRead);
+                                }
+                            }
+                            byte[] xmiBytes = decompressedBaos.toByteArray();
+                            String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8);
+                            JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
+                            List<Map<String, Object>> annotations = processJCas(jcas);
+                            // speeches are stored with a "speechKey" field, so filter on that field
+                            Document filter = new Document("speechKey", speechKey);
+                            Document updateFields = new Document()
+                                    .append("annotations", annotations)
+                                    .append("xmi", xmiContent);
+                            Document update = new Document("$set", updateFields);
+                            UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update);
+                            bulkOperations.add(updateModel);
+                        } catch (Exception e) {
+                            // best effort: a broken entry must not abort the whole import
+                            e.printStackTrace();
+                        }
+                    }
+                    zis.closeEntry();
+                }
+            }
+            if (!bulkOperations.isEmpty()) {
+                mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
+                System.out.println("Bulk update erfolgreich für " + bulkOperations.size() + " Dokumente.");
+            }
+        } finally {
+            mongoDBHandler.close();
+        }
+    }
+
+
+    /**
+     * Reads an InputStream completely into a String (decoded as UTF-8).
+     *
+     * @param is the stream to read
+     * @return the whole content, each line terminated by a newline character
+     * @throws IOException if reading fails
+     */
+    private static String readInputStream(InputStream is) throws IOException {
+        BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+        StringBuilder sb = new StringBuilder();
+        String line;
+        while ((line = reader.readLine()) != null) {
+            sb.append(line).append("\n");
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Converts the XMI content of the given InputStream into a JCas.
+     * The gzip-compressed type system descriptor from the resources is decompressed first.
+     *
+     * @param xmiInputStream stream with the XMI content
+     * @return the deserialized JCas
+     * @throws Exception if loading the descriptor or deserializing the XMI fails
+     */
+    private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
+        InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
+        if (tsCompressedStream == null) {
+            throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
+        }
+        try (GZIPInputStream tsStream = new GZIPInputStream(tsCompressedStream)) {
+            XMLInputSource inputSource = new XMLInputSource(tsStream);
+            TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
+            CAS cas = CasCreationUtils.createCas(tsd, null, null);
+            XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
+            return cas.getJCas();
+        }
+    }
+
+    /**
+     * Iterates over all annotations of the JCas and collects, for each one, its type name
+     * and its feature values (as strings, truncated to MAX_FEATURE_LENGTH characters).
+     *
+     * @param jcas the deserialized JCas
+     * @return one map per annotation with the keys "type" and "features"
+     */
+    private static List<Map<String, Object>> processJCas(JCas jcas) {
+        List<Map<String, Object>> annotationsData = new ArrayList<>();
+        CAS cas = jcas.getCas();
+        AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
+        for (AnnotationFS annotation : index) {
+            Map<String, Object> annotationData = new HashMap<>();
+            annotationData.put("type", annotation.getType().getName());
+            Map<String, String> featuresMap = new HashMap<>();
+            for (Feature feature : annotation.getType().getFeatures()) {
+                String featureName = feature.getShortName();
+                String featureValue = annotation.getFeatureValueAsString(feature);
+                if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
+                    featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH); // trim for efficiency
+                }
+                featuresMap.put(featureName, featureValue);
+            }
+            annotationData.put("features", featuresMap);
+
+            annotationsData.add(annotationData);
+        }
+        return annotationsData;
+    }
+
 }
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java
index 739a909..2d1dff8 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Impls/Speech_File_Impl.java
@@ -17,6 +17,7 @@ public class Speech_File_Impl implements Speech {
     private final String speakerName;
     private final String fraction;
     private final List speechContents;
+    private final String speechKey;
 
     @Override
     public int getSessionId() {
@@ -53,13 +54,19 @@ public class Speech_File_Impl implements Speech {
         return speechContents;
     }
 
-    public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction) {
+    @Override
+    public String getSpeechKey() {
+        return speechKey;
+    }
+
+    public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction, String speechKey) {
         this.speakerId = speakerId;
         this.agendaItemId = agendaItemId;
         this.speechId = speechId;
         this.speakerName = speakerName;
         this.fraction = fraction;
         this.sessionId = sessionId;
+        this.speechKey = speechKey;
         this.speechContents = new ArrayList<>();
     }
 
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java
index 786741e..9445234 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/Interfaces/Speech.java
@@ -54,6 +54,13 @@ public interface Speech {
      */
     int getSessionId();
 
+    /**
+     * Returns the speech key.
+     *
+     * @return The speech key.
+     */
+    String getSpeechKey();
+
     /**
      * Returns the speech contents.
      *
diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java
index d09eee1..124d359 100644
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java
@@ -112,14 +112,14 @@ public class SpeechParser {
                 int speakerId = Integer.parseInt(speakerElement.getAttribute("id"));
                 Element nameElement = (Element) speakerElement.getElementsByTagName("name").item(0);
                 if (nameElement == null) continue;
-
+                String redeID = speechElement.getAttribute("id");
                 String title = getOptionalTextContent(nameElement, "titel");
                 String firstName = getOptionalTextContent(nameElement, "vorname");
                 String lastName = getOptionalTextContent(nameElement, "nachname");
                 String fraction = getOptionalTextContent(nameElement, "fraktion");
                 String speakerName = (title != null ? title + " " : "") + firstName + " " + lastName;
 
-                Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction);
+                Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction, redeID);
 
                 // Add the speaker to speech contents
                 speech.addContent(new Speaker_File_Impl(0, speechId, speakerId, speakerName, fraction));
diff --git a/src/main/resources/speeches/TypeSystem.xml.gz b/src/main/resources/speeches/TypeSystem.xml.gz
new file mode 100644
index 0000000..3d45d40
Binary files /dev/null and b/src/main/resources/speeches/TypeSystem.xml.gz differ