code needs 21718261765125gb ram :D

This commit is contained in:
Picman2000 2025-03-06 13:05:48 +01:00
parent 4af6279324
commit e2244a4b45
8 changed files with 199 additions and 17 deletions

View file

@ -5,6 +5,7 @@ import com.mongodb.client.MongoDatabase;
import org.texttechnologylab.project.gruppe_05_1.database.*;
import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
import org.texttechnologylab.project.gruppe_05_1.util.Logger;
import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
@ -56,12 +57,13 @@ public class Main {
//TEST
Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
if (MongoPprUtils.getSpeechCollection().countDocuments() != 0) {
System.out.println("Speeches werden nicht gelesen, da sie bereits in der Datenbank stehen");
}
else {
Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
SpeechIndex speechIndex = speechIndexFactory
.parseLegislativePeriods(TRUE)
.builder()
@ -91,10 +93,6 @@ public class Main {
mongoDBHandler.close(); // Close the connection to the DB
}
//TEST
// Stellt fest, dass alle nötigen Datenbank-Collections existieren
PPRUtils.ensureCollectionExist();
@ -102,6 +100,9 @@ public class Main {
PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);
// NLP-Verarbeitung - TODO
NlpUtils.importXmiData();
RESTHandler restHandler = new RESTHandler();
restHandler.startJavalin();

View file

@ -7,9 +7,7 @@ import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.Indexes;
import com.mongodb.client.model.Updates;
import com.mongodb.client.model.*;
import exceptions.AgendaItemNotFoundException;
import exceptions.MemberNotFoundException;
import exceptions.ServerErrorException;
@ -537,7 +535,8 @@ public class MongoDBHandler {
.append("speechId", speech.getSpeechId())
.append("speakerId", speech.getSpeakerId())
.append("speakerName", speech.getSpeakerName())
.append("fraction", speech.getFraction());
.append("fraction", speech.getFraction())
.append("speechKey", speech.getSpeechKey());
// Convert speechContents to a list of Documents
List<Document> contentDocuments = new ArrayList<>();
@ -638,6 +637,12 @@ public class MongoDBHandler {
return result;
}
/**
 * Stores raw XMI content for a single speech document.
 * Matches the document whose "speechKey" field equals the given key and
 * sets (or overwrites) its "xmiData" field; a no-op if nothing matches.
 *
 * @param speechKey  key identifying the speech document
 * @param xmiContent serialized XMI payload to store
 */
public void updateXmiData(String speechKey, String xmiContent) {
    Document filter = new Document("speechKey", speechKey);
    Document change = new Document("$set", new Document("xmiData", xmiContent));
    speechesCollection.updateOne(filter, change);
}
public void deleteAllDocuments() {
speechesCollection.deleteMany(new Document());
@ -646,6 +651,11 @@ public class MongoDBHandler {
//historyCollection.deleteMany(new Document());
}
/**
 * Executes the given write models as one unordered bulk write.
 *
 * NOTE(review): the {@code collectionName} parameter is currently ignored —
 * every call writes to {@code speechesCollection} regardless of the name
 * passed in. Confirm whether this is intended or a latent bug.
 *
 * @param collectionName name of the target collection (currently unused)
 * @param updates        write models to apply in bulk
 */
public void bulkUpdateDocuments(String collectionName, List<WriteModel<Document>> updates) {
// unordered: remaining operations still run if one of them fails
BulkWriteOptions options = new BulkWriteOptions().ordered(false);
speechesCollection.bulkWrite(updates, options);
}
/**
 * Closes the underlying MongoDB client and releases its connections.
 * The handler must not be used after calling this method.
 */
public void close() {
mongoClient.close();
}

View file

@ -15,7 +15,10 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
mongoDocument.getInteger("speechId"),
mongoDocument.getInteger("speakerId"),
mongoDocument.getString("speakerName"),
mongoDocument.getString("fraction"));
mongoDocument.getString("fraction"),
mongoDocument.getString("speechKey")
);
for (Document content : (List<Document>) mongoDocument.get("speechContents")) {
switch (content.getString("type")) {
@ -32,5 +35,6 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
throw new IllegalArgumentException("Unknown content type: " + content.getString("type"));
}
}
}
}

View file

@ -1,15 +1,29 @@
package org.texttechnologylab.project.gruppe_05_1.nlp;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.UpdateOneModel;
import com.mongodb.client.model.Updates;
import com.mongodb.client.model.WriteModel;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.UIMAFramework;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
import org.bson.Document;
import org.dkpro.core.io.xmi.XmiWriter;
import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIDockerDriver;
@ -17,25 +31,31 @@ import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
import org.texttechnologylab.annotation.NamedEntity;
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
import org.texttechnologylab.uima.type.Sentiment;
import org.xml.sax.SAXException;
import java.io.File;
import java.io.IOException;
import java.io.*;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Base64;
import java.util.Collection;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.springframework.core.io.buffer.DataBufferUtils.readInputStream;
public class NlpUtils {
// common class-attributes
private static DUUIComposer pComposer = null;
private static int iWorkers = 1;
private static final String TYPE_SYSTEM_DESCRIPTOR_PATH = "/speeches/TypeSystem.xml.gz";
private static final int MAX_FEATURE_LENGTH = 10000;
public static void createNlpData() {
@ -310,4 +330,137 @@ public class NlpUtils {
// TODO: empty stub — presumably intended to derive sentiment information
// (see the Sentiment import above); not implemented yet.
private static void createSentimentInfo() {
}
/**
 * Reads the ZIP archive from the resource folder (/speeches/20.zip), iterates
 * over all contained .gz entries, decompresses each one, converts the XMI
 * content into a JCas, extracts its annotations, and queues an update for the
 * corresponding MongoDB document. All queued updates are then sent as a single
 * bulk write to the "Rede" collection.
 *
 * The speech key is the entry name with the trailing ".gz" stripped; updates
 * match documents on their "RedeID" field.
 * NOTE(review): updateXmiData elsewhere matches on "speechKey" instead of
 * "RedeID" — confirm which field name the speeches collection actually uses.
 *
 * @throws IOException if the archive is missing or cannot be read
 */
public static void importXmiData() throws IOException {
    InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
    if (zipStream == null) {
        throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
    }
    MongoDBHandler mongoDBHandler = new MongoDBHandler();
    try {
        List<WriteModel<Document>> bulkOperations = new ArrayList<>();
        try (ZipInputStream zis = new ZipInputStream(zipStream)) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (!entry.isDirectory() && entry.getName().endsWith(".gz")) {
                    // NOTE(review): if entry names carry directory prefixes, the
                    // prefix ends up inside the key — confirm the archive is flat.
                    String speechKey = entry.getName().substring(0, entry.getName().length() - 3);
                    byte[] gzData = readCurrentEntry(zis);
                    try {
                        byte[] xmiBytes = gunzip(gzData);
                        String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8);
                        JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
                        List<Map<String, Object>> annotations = processJCas(jcas);
                        Document filter = new Document("RedeID", speechKey);
                        Document updateFields = new Document()
                                .append("annotations", annotations)
                                .append("xmi", xmiContent);
                        bulkOperations.add(new UpdateOneModel<>(filter, new Document("$set", updateFields)));
                    } catch (Exception e) {
                        // Best effort: one broken entry must not abort the whole import.
                        e.printStackTrace();
                    }
                }
                zis.closeEntry();
            }
        }
        if (!bulkOperations.isEmpty()) {
            mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
            System.out.println("Bulk update erfolgreich für " + bulkOperations.size() + " Dokumente.");
        }
    } finally {
        // Fix: the handler previously leaked when an exception escaped mid-import.
        mongoDBHandler.close();
    }
}

/** Reads the remainder of the current ZIP entry into a byte array (stream stays open). */
private static byte[] readCurrentEntry(ZipInputStream zis) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    byte[] buffer = new byte[4096];
    int len;
    while ((len = zis.read(buffer)) != -1) {
        baos.write(buffer, 0, len);
    }
    return baos.toByteArray();
}

/** Fully decompresses a GZIP-compressed byte array in memory. */
private static byte[] gunzip(byte[] gzData) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
        byte[] buf = new byte[4096];
        int bytesRead;
        while ((bytesRead = gzis.read(buf)) != -1) {
            out.write(buf, 0, bytesRead);
        }
    }
    return out.toByteArray();
}
/**
 * Reads an InputStream to exhaustion as UTF-8 text.
 * Each input line is appended followed by a single '\n', so the result always
 * ends with a newline and platform line separators are normalized to '\n'.
 *
 * @param is stream to consume (not closed by this method)
 * @return the full textual content
 * @throws IOException if reading fails
 */
private static String readInputStream(InputStream is) throws IOException {
    StringBuilder content = new StringBuilder();
    BufferedReader in = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
    for (String current = in.readLine(); current != null; current = in.readLine()) {
        content.append(current).append('\n');
    }
    return content.toString();
}
/**
 * Converts the XMI content of the given stream into a JCas, using the bundled
 * gzip-compressed TypeSystem descriptor at {@code TYPE_SYSTEM_DESCRIPTOR_PATH}.
 *
 * @param xmiInputStream stream containing the XMI payload
 * @return the deserialized JCas
 * @throws IllegalArgumentException if the descriptor resource is missing
 * @throws Exception if parsing the descriptor or deserializing the XMI fails
 */
private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
    InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
    if (tsCompressedStream == null) {
        throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
    }
    // Fix: also close the raw resource stream when GZIPInputStream construction
    // itself throws (e.g. corrupt gzip header) — previously it leaked.
    try (InputStream raw = tsCompressedStream;
         GZIPInputStream tsStream = new GZIPInputStream(raw)) {
        XMLInputSource inputSource = new XMLInputSource(tsStream);
        TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
        CAS cas = CasCreationUtils.createCas(tsd, null, null);
        // third argument = lenient: tolerate XMI content unknown to this type system
        XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
        return cas.getJCas();
    }
}
/**
 * Extracts every annotation from the JCas into a list of maps suitable for
 * MongoDB storage. Each map holds the annotation's fully qualified type name
 * under "type" and its feature values (as strings, keyed by short feature
 * name) under "features". Values longer than MAX_FEATURE_LENGTH are truncated.
 *
 * NOTE(review): getFeatureValueAsString is assumed to work for all features
 * present here — verify it does not throw for non-string-convertible features.
 *
 * @param jcas the deserialized JCas
 * @return one map per annotation in index order
 */
private static List<Map<String, Object>> processJCas(JCas jcas) {
    List<Map<String, Object>> annotationsData = new ArrayList<>();
    CAS cas = jcas.getCas();
    AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
    for (AnnotationFS annotation : index) {
        Map<String, Object> annotationData = new HashMap<>();
        annotationData.put("type", annotation.getType().getName());
        Map<String, String> featuresMap = new HashMap<>();
        for (Feature feature : annotation.getType().getFeatures()) {
            String featureName = feature.getShortName();
            String featureValue = annotation.getFeatureValueAsString(feature);
            if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
                // Cap very large values (e.g. full document text) to keep documents small.
                featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH);
            }
            // Fix: the original inserted this exact entry twice in a row —
            // the redundant duplicate put(...) has been removed.
            featuresMap.put(featureName, featureValue);
        }
        annotationData.put("features", featuresMap);
        annotationsData.add(annotationData);
    }
    return annotationsData;
}
}

View file

@ -17,6 +17,7 @@ public class Speech_File_Impl implements Speech {
private final String speakerName;
private final String fraction;
private final List<Content> speechContents;
private final String speechKey;
@Override
public int getSessionId() {
@ -53,13 +54,19 @@ public class Speech_File_Impl implements Speech {
return speechContents;
}
public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction) {
/**
 * Returns the key identifying this speech.
 *
 * @return the speech key
 */
@Override
public String getSpeechKey() {
return speechKey;
}
/**
 * Creates a speech with the given identifiers and metadata.
 * The content list starts empty and is populated later via addContent.
 *
 * @param sessionId    id of the parliamentary session
 * @param agendaItemId id of the agenda item the speech belongs to
 * @param speechId     id of the speech itself
 * @param speakerId    id of the speaker
 * @param speakerName  display name of the speaker
 * @param fraction     parliamentary fraction of the speaker
 * @param speechKey    key identifying this speech
 */
public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction, String speechKey) {
    this.sessionId = sessionId;
    this.agendaItemId = agendaItemId;
    this.speechId = speechId;
    this.speakerId = speakerId;
    this.speakerName = speakerName;
    this.fraction = fraction;
    this.speechKey = speechKey;
    this.speechContents = new ArrayList<>();
}

View file

@ -54,6 +54,13 @@ public interface Speech {
*/
int getSessionId();
/**
 * Returns the speech key.
 * (Previous javadoc said "session Key" — a copy-paste error; this accessor
 * returns the key of the speech, not of the session.)
 *
 * @return the speech key
 */
String getSpeechKey();
/**
* Returns the speech contents.
*

View file

@ -112,14 +112,14 @@ public class SpeechParser {
int speakerId = Integer.parseInt(speakerElement.getAttribute("id"));
Element nameElement = (Element) speakerElement.getElementsByTagName("name").item(0);
if (nameElement == null) continue;
String redeID = speechElement.getAttribute("id");
String title = getOptionalTextContent(nameElement, "titel");
String firstName = getOptionalTextContent(nameElement, "vorname");
String lastName = getOptionalTextContent(nameElement, "nachname");
String fraction = getOptionalTextContent(nameElement, "fraktion");
String speakerName = (title != null ? title + " " : "") + firstName + " " + lastName;
Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction);
Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction, redeID);
// Add the speaker to speech contents
speech.addContent(new Speaker_File_Impl(0, speechId, speakerId, speakerName, fraction));

Binary file not shown.