code needs 21718261765125gb ram :D
This commit is contained in:
parent
4af6279324
commit
e2244a4b45
8 changed files with 199 additions and 17 deletions
|
@ -5,6 +5,7 @@ import com.mongodb.client.MongoDatabase;
|
|||
import org.texttechnologylab.project.gruppe_05_1.database.*;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
|
||||
import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
|
||||
import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
|
||||
import org.texttechnologylab.project.gruppe_05_1.util.Logger;
|
||||
import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
|
||||
|
@ -56,12 +57,13 @@ public class Main {
|
|||
|
||||
//TEST
|
||||
|
||||
Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
|
||||
|
||||
SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
|
||||
if (MongoPprUtils.getSpeechCollection().countDocuments() != 0) {
|
||||
System.out.println("Speeches werden nicht gelesen, da sie bereits in der Datenbank stehen");
|
||||
}
|
||||
else {
|
||||
Logger.pink("Parsing XML and inserting data into DB (Uebung 2)...");
|
||||
SpeechIndex speechIndex = speechIndexFactory
|
||||
.parseLegislativePeriods(TRUE)
|
||||
.builder()
|
||||
|
@ -91,10 +93,6 @@ public class Main {
|
|||
mongoDBHandler.close(); // Close the connection to the DB
|
||||
}
|
||||
|
||||
|
||||
//TEST
|
||||
|
||||
|
||||
// Stellt fest, dass alle nötigen Datenbank-Collections existieren
|
||||
PPRUtils.ensureCollectionExist();
|
||||
|
||||
|
@ -102,6 +100,9 @@ public class Main {
|
|||
PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);
|
||||
|
||||
// NLP-Verarbeitung - TODO
|
||||
NlpUtils.importXmiData();
|
||||
|
||||
|
||||
RESTHandler restHandler = new RESTHandler();
|
||||
restHandler.startJavalin();
|
||||
|
||||
|
|
|
@ -7,9 +7,7 @@ import com.mongodb.client.MongoClient;
|
|||
import com.mongodb.client.MongoClients;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import com.mongodb.client.model.Filters;
|
||||
import com.mongodb.client.model.Indexes;
|
||||
import com.mongodb.client.model.Updates;
|
||||
import com.mongodb.client.model.*;
|
||||
import exceptions.AgendaItemNotFoundException;
|
||||
import exceptions.MemberNotFoundException;
|
||||
import exceptions.ServerErrorException;
|
||||
|
@ -537,7 +535,8 @@ public class MongoDBHandler {
|
|||
.append("speechId", speech.getSpeechId())
|
||||
.append("speakerId", speech.getSpeakerId())
|
||||
.append("speakerName", speech.getSpeakerName())
|
||||
.append("fraction", speech.getFraction());
|
||||
.append("fraction", speech.getFraction())
|
||||
.append("speechKey", speech.getSpeechKey());
|
||||
|
||||
// Convert speechContents to a list of Documents
|
||||
List<Document> contentDocuments = new ArrayList<>();
|
||||
|
@ -638,6 +637,12 @@ public class MongoDBHandler {
|
|||
return result;
|
||||
}
|
||||
|
||||
public void updateXmiData(String speechKey, String xmiContent) {
|
||||
speechesCollection.updateOne(
|
||||
Filters.eq("speechKey", speechKey),
|
||||
Updates.set("xmiData", xmiContent)
|
||||
);
|
||||
}
|
||||
|
||||
public void deleteAllDocuments() {
|
||||
speechesCollection.deleteMany(new Document());
|
||||
|
@ -646,6 +651,11 @@ public class MongoDBHandler {
|
|||
//historyCollection.deleteMany(new Document());
|
||||
}
|
||||
|
||||
public void bulkUpdateDocuments(String collectionName, List<WriteModel<Document>> updates) {
|
||||
BulkWriteOptions options = new BulkWriteOptions().ordered(false);
|
||||
speechesCollection.bulkWrite(updates, options);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
mongoClient.close();
|
||||
}
|
||||
|
|
|
@ -15,7 +15,10 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
|
|||
mongoDocument.getInteger("speechId"),
|
||||
mongoDocument.getInteger("speakerId"),
|
||||
mongoDocument.getString("speakerName"),
|
||||
mongoDocument.getString("fraction"));
|
||||
mongoDocument.getString("fraction"),
|
||||
mongoDocument.getString("speechKey")
|
||||
);
|
||||
|
||||
|
||||
for (Document content : (List<Document>) mongoDocument.get("speechContents")) {
|
||||
switch (content.getString("type")) {
|
||||
|
@ -32,5 +35,6 @@ public class Speech_MongoDB_Impl extends Speech_File_Impl implements Speech {
|
|||
throw new IllegalArgumentException("Unknown content type: " + content.getString("type"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,15 +1,29 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.nlp;
|
||||
|
||||
import com.mongodb.client.model.Filters;
|
||||
import com.mongodb.client.model.UpdateOneModel;
|
||||
import com.mongodb.client.model.Updates;
|
||||
import com.mongodb.client.model.WriteModel;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.uima.UIMAException;
|
||||
import org.apache.uima.UIMAFramework;
|
||||
import org.apache.uima.cas.CAS;
|
||||
import org.apache.uima.cas.CASException;
|
||||
import org.apache.uima.cas.Feature;
|
||||
import org.apache.uima.cas.impl.XmiCasDeserializer;
|
||||
import org.apache.uima.cas.text.AnnotationFS;
|
||||
import org.apache.uima.cas.text.AnnotationIndex;
|
||||
import org.apache.uima.fit.factory.JCasFactory;
|
||||
import org.apache.uima.fit.util.JCasUtil;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.resource.metadata.TypeSystemDescription;
|
||||
import org.apache.uima.util.CasCreationUtils;
|
||||
import org.apache.uima.util.XMLInputSource;
|
||||
import org.bson.Document;
|
||||
import org.dkpro.core.io.xmi.XmiWriter;
|
||||
import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
|
||||
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIDockerDriver;
|
||||
|
@ -17,25 +31,31 @@ import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
|
|||
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
|
||||
import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
|
||||
import org.texttechnologylab.annotation.NamedEntity;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
|
||||
import org.texttechnologylab.uima.type.Sentiment;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.*;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Base64;
|
||||
import java.util.Collection;
|
||||
import java.util.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
||||
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
|
||||
import static org.springframework.core.io.buffer.DataBufferUtils.readInputStream;
|
||||
|
||||
public class NlpUtils {
|
||||
|
||||
// common class-attributes
|
||||
private static DUUIComposer pComposer = null;
|
||||
private static int iWorkers = 1;
|
||||
private static final String TYPE_SYSTEM_DESCRIPTOR_PATH = "/speeches/TypeSystem.xml.gz";
|
||||
private static final int MAX_FEATURE_LENGTH = 10000;
|
||||
|
||||
|
||||
public static void createNlpData() {
|
||||
|
@ -310,4 +330,137 @@ public class NlpUtils {
|
|||
private static void createSentimentInfo() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Liest die ZIP-Datei aus dem Ressourcenordner (/speeches/20.zip),
|
||||
* iteriert über alle .gz-Dateien, dekomprimiert sie, wandelt den XMI-Inhalt in einen JCas um,
|
||||
* verarbeitet den JCas (iteriert über alle Annotationen) und extrahiert z. B. NLP-Daten (z. B. POS-Tags).
|
||||
* Anschließend wird das jeweilige Dokument in MongoDB aktualisiert.
|
||||
*
|
||||
* @throws IOException falls ein Fehler beim Dateizugriff auftritt
|
||||
*/
|
||||
public static void importXmiData() throws IOException {
|
||||
MongoDBHandler mongoDBHandler = new MongoDBHandler();
|
||||
List<WriteModel<Document>> bulkOperations = new ArrayList<>();
|
||||
InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
|
||||
if (zipStream == null) {
|
||||
throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
|
||||
}
|
||||
try (ZipInputStream zis = new ZipInputStream(zipStream)) {
|
||||
ZipEntry entry;
|
||||
while ((entry = zis.getNextEntry()) != null) {
|
||||
if (!entry.isDirectory() && entry.getName().endsWith(".gz")) {
|
||||
String speechKey = entry.getName().substring(0, entry.getName().length() - 3);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
byte[] buffer = new byte[4096];
|
||||
int len;
|
||||
while ((len = zis.read(buffer)) != -1) {
|
||||
baos.write(buffer, 0, len);
|
||||
}
|
||||
byte[] gzData = baos.toByteArray();
|
||||
try {
|
||||
|
||||
ByteArrayOutputStream decompressedBaos = new ByteArrayOutputStream();
|
||||
try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
|
||||
byte[] buf = new byte[4096];
|
||||
int bytesRead;
|
||||
while ((bytesRead = gzis.read(buf)) != -1) {
|
||||
decompressedBaos.write(buf, 0, bytesRead);
|
||||
}
|
||||
}
|
||||
byte[] xmiBytes = decompressedBaos.toByteArray();
|
||||
String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8);
|
||||
JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
|
||||
List<Map<String, Object>> annotations = processJCas(jcas);
|
||||
Document filter = new Document("RedeID", speechKey);
|
||||
Document updateFields = new Document()
|
||||
.append("annotations", annotations)
|
||||
.append("xmi", xmiContent);
|
||||
Document update = new Document("$set", updateFields);
|
||||
UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update);
|
||||
bulkOperations.add(updateModel);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
zis.closeEntry();
|
||||
}
|
||||
}
|
||||
if (!bulkOperations.isEmpty()) {
|
||||
mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
|
||||
System.out.println("Bulk update erfolgreich für " + bulkOperations.size() + " Dokumente.");
|
||||
}
|
||||
mongoDBHandler.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Liest einen InputStream vollständig in einen String ein.
|
||||
*
|
||||
* @param is InputStream
|
||||
* @return Den gesamten String-Inhalt
|
||||
* @throws IOException falls ein Fehler beim Lesen auftritt
|
||||
*/
|
||||
private static String readInputStream(InputStream is) throws IOException {
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
sb.append(line).append("\n");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Wandelt den XMI-Inhalt aus dem InputStream in einen JCas um.
|
||||
* Dabei wird der hardcodierte, komprimierte TypeSystem-Descriptor zuerst dekomprimiert.
|
||||
*
|
||||
* @param xmiInputStream InputStream des XMI-Inhalts
|
||||
* @return Den konvertierten JCas
|
||||
* @throws Exception falls beim Laden oder Deserialisieren ein Fehler auftritt
|
||||
*/
|
||||
private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
|
||||
InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
|
||||
if (tsCompressedStream == null) {
|
||||
throw new IllegalArgumentException("TypeSystem-Descriptor nicht gefunden: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
|
||||
}
|
||||
try (GZIPInputStream tsStream = new GZIPInputStream(tsCompressedStream)) {
|
||||
XMLInputSource inputSource = new XMLInputSource(tsStream);
|
||||
TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
|
||||
CAS cas = CasCreationUtils.createCas(tsd, null, null);
|
||||
XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
|
||||
return cas.getJCas();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iteriert über alle Annotationen im JCas und verarbeitet diese.
|
||||
* Hier kannst du deine eigene Logik einfügen, um die JCas-Daten weiter zu verarbeiten.
|
||||
*
|
||||
* @param jcas Der konvertierte JCas
|
||||
*/
|
||||
private static List<Map<String, Object>> processJCas(JCas jcas) {
|
||||
List<Map<String, Object>> annotationsData = new ArrayList<>();
|
||||
CAS cas = jcas.getCas();
|
||||
AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
|
||||
for (AnnotationFS annotation : index) {
|
||||
Map<String, Object> annotationData = new HashMap<>();
|
||||
annotationData.put("type", annotation.getType().getName());
|
||||
Map<String, String> featuresMap = new HashMap<>();
|
||||
for (Feature feature : annotation.getType().getFeatures()) {
|
||||
String featureName = feature.getShortName();
|
||||
String featureValue = annotation.getFeatureValueAsString(feature);
|
||||
if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
|
||||
featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH); // trim for efficiency
|
||||
}
|
||||
featuresMap.put(featureName, featureValue);
|
||||
featuresMap.put(featureName, featureValue);
|
||||
}
|
||||
annotationData.put("features", featuresMap);
|
||||
|
||||
annotationsData.add(annotationData);
|
||||
}
|
||||
return annotationsData;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ public class Speech_File_Impl implements Speech {
|
|||
private final String speakerName;
|
||||
private final String fraction;
|
||||
private final List<Content> speechContents;
|
||||
private final String speechKey;
|
||||
|
||||
@Override
|
||||
public int getSessionId() {
|
||||
|
@ -53,13 +54,19 @@ public class Speech_File_Impl implements Speech {
|
|||
return speechContents;
|
||||
}
|
||||
|
||||
public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction) {
|
||||
@Override
|
||||
public String getSpeechKey() {
|
||||
return speechKey;
|
||||
}
|
||||
|
||||
/**
 * Creates a speech from the given values; the list of speech contents starts out empty.
 *
 * @param sessionId    value stored as the session id
 * @param agendaItemId value stored as the agenda item id
 * @param speechId     value stored as the speech id
 * @param speakerId    value stored as the speaker id
 * @param speakerName  value stored as the speaker name
 * @param fraction     value stored as the fraction
 * @param speechKey    value stored as the speech key
 */
public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction, String speechKey) {
    // Assigned in parameter order.
    this.sessionId = sessionId;
    this.agendaItemId = agendaItemId;
    this.speechId = speechId;
    this.speakerId = speakerId;
    this.speakerName = speakerName;
    this.fraction = fraction;
    this.speechKey = speechKey;
    this.speechContents = new ArrayList<>();
}
|
||||
|
||||
|
|
|
@ -54,6 +54,13 @@ public interface Speech {
|
|||
*/
|
||||
int getSessionId();
|
||||
|
||||
/**
|
||||
* Returns the speech key.
|
||||
*
|
||||
* @return The speech key.
|
||||
*/
|
||||
String getSpeechKey();
|
||||
|
||||
/**
|
||||
* Returns the speech contents.
|
||||
*
|
||||
|
|
|
@ -112,14 +112,14 @@ public class SpeechParser {
|
|||
int speakerId = Integer.parseInt(speakerElement.getAttribute("id"));
|
||||
Element nameElement = (Element) speakerElement.getElementsByTagName("name").item(0);
|
||||
if (nameElement == null) continue;
|
||||
|
||||
String redeID = speechElement.getAttribute("id");
|
||||
String title = getOptionalTextContent(nameElement, "titel");
|
||||
String firstName = getOptionalTextContent(nameElement, "vorname");
|
||||
String lastName = getOptionalTextContent(nameElement, "nachname");
|
||||
String fraction = getOptionalTextContent(nameElement, "fraktion");
|
||||
|
||||
String speakerName = (title != null ? title + " " : "") + firstName + " " + lastName;
|
||||
Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction);
|
||||
Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction, redeID);
|
||||
|
||||
// Add the speaker to speech contents
|
||||
speech.addContent(new Speaker_File_Impl(0, speechId, speakerId, speakerName, fraction));
|
||||
|
|
BIN
src/main/resources/speeches/TypeSystem.xml.gz
Normal file
BIN
src/main/resources/speeches/TypeSystem.xml.gz
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue