Implemented an XMI extractor for extracting and uploading already analyzed data from the given files.
This commit is contained in:
parent e2244a4b45
commit 7113637f00

3 changed files with 8 additions and 133 deletions
@@ -6,6 +6,7 @@ import org.texttechnologylab.project.gruppe_05_1.database.*;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.Mdb;
 import org.texttechnologylab.project.gruppe_05_1.domain.mdb.MdbDocument;
 import org.texttechnologylab.project.gruppe_05_1.nlp.NlpUtils;
+import org.texttechnologylab.project.gruppe_05_1.nlp.XmiExtractor;
 import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
 import org.texttechnologylab.project.gruppe_05_1.util.Logger;
 import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
@@ -99,8 +100,10 @@ public class Main {
         // Read all information (parliamentarians, speeches, comments, etc.) and insert it into the MongoDB if it is not already present.
         PPRUtils.parlamentExplorerInit(xmlFactory, mongoFactory);

+        // Upload of the analyzed XMI files
+
         // NLP processing - TODO
-        NlpUtils.importXmiData();
+        // NlpUtils.importXmiData();

         RESTHandler restHandler = new RESTHandler();

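Main now imports XmiExtractor while the direct NlpUtils.importXmiData() call is commented out, so the XMI upload presumably moves into the new class. A sketch of what the replacement call site might look like; XmiExtractor's actual constructor and method name are not visible in this commit and are assumptions:

    // Hypothetical call site - XmiExtractor's real API is not shown in this diff.
    XmiExtractor xmiExtractor = new XmiExtractor(); // assumed no-arg constructor
    xmiExtractor.importXmiData();                   // assumed to mirror NlpUtils.importXmiData()
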
@@ -94,6 +94,10 @@ public class MongoDBHandler {

     }

+    public MongoDatabase getDatabase() {
+        return this.database;
+    }
+
     /**
      * Get the MongoDB according to properties.
      * If a local server URI is defined, use it. Otherwise, use remote server.
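The new getDatabase() accessor exposes the underlying MongoDatabase, which is exactly what bulk updates like the ones in the hunk below need. A minimal sketch with the MongoDB Java driver; the collection name "Rede" and the RedeID filter field come from this repository's code, while the concrete id and payload are placeholders:

    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.model.UpdateOneModel;
    import com.mongodb.client.model.WriteModel;
    import org.bson.Document;

    import java.util.ArrayList;
    import java.util.List;

    public class BulkUpdateExample {
        public static void main(String[] args) {
            MongoDBHandler handler = new MongoDBHandler();
            // The new accessor gives callers direct access to the configured database.
            MongoCollection<Document> speeches = handler.getDatabase().getCollection("Rede");

            // One update model per speech, as in NlpUtils.importXmiData() below.
            List<WriteModel<Document>> ops = new ArrayList<>();
            ops.add(new UpdateOneModel<>(
                    new Document("RedeID", "ID201234"),                        // placeholder speech id
                    new Document("$set", new Document("xmi", "<xmi:XMI/>")))); // placeholder payload

            if (!ops.isEmpty()) {
                speeches.bulkWrite(ops); // one round trip for all updates
            }
            handler.close();
        }
    }

Bundling the UpdateOneModel operations into a single bulkWrite keeps the number of round trips to MongoDB constant regardless of how many speeches are updated.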
@@ -331,136 +331,4 @@ public class NlpUtils {

     }

-    /**
-     * Reads the ZIP file from the resource folder (/speeches/20.zip),
-     * iterates over all .gz files, decompresses them, converts the XMI content into a JCas,
-     * processes the JCas (iterating over all annotations) and extracts NLP data (e.g. POS tags).
-     * Finally, each corresponding document is updated in MongoDB.
-     *
-     * @throws IOException if an error occurs while accessing the files
-     */
-    public static void importXmiData() throws IOException {
-        MongoDBHandler mongoDBHandler = new MongoDBHandler();
-        List<WriteModel<Document>> bulkOperations = new ArrayList<>();
-        InputStream zipStream = NlpUtils.class.getResourceAsStream("/speeches/20.zip");
-        if (zipStream == null) {
-            throw new IOException("20.zip not found in the resource folder /speeches");
-        }
-        try (ZipInputStream zis = new ZipInputStream(zipStream)) {
-            ZipEntry entry;
-            while ((entry = zis.getNextEntry()) != null) {
-                if (!entry.isDirectory() && entry.getName().endsWith(".gz")) {
-                    String speechKey = entry.getName().substring(0, entry.getName().length() - 3);
-                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
-                    byte[] buffer = new byte[4096];
-                    int len;
-                    while ((len = zis.read(buffer)) != -1) {
-                        baos.write(buffer, 0, len);
-                    }
-                    byte[] gzData = baos.toByteArray();
-                    try {
-                        ByteArrayOutputStream decompressedBaos = new ByteArrayOutputStream();
-                        try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzData))) {
-                            byte[] buf = new byte[4096];
-                            int bytesRead;
-                            while ((bytesRead = gzis.read(buf)) != -1) {
-                                decompressedBaos.write(buf, 0, bytesRead);
-                            }
-                        }
-                        byte[] xmiBytes = decompressedBaos.toByteArray();
-                        String xmiContent = new String(xmiBytes, StandardCharsets.UTF_8);
-                        JCas jcas = convertXmiToJCas(new ByteArrayInputStream(xmiBytes));
-                        List<Map<String, Object>> annotations = processJCas(jcas);
-                        Document filter = new Document("RedeID", speechKey);
-                        Document updateFields = new Document()
-                                .append("annotations", annotations)
-                                .append("xmi", xmiContent);
-                        Document update = new Document("$set", updateFields);
-                        UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update);
-                        bulkOperations.add(updateModel);
-                    } catch (Exception e) {
-                        e.printStackTrace();
-                    }
-                }
-                zis.closeEntry();
-            }
-        }
-        if (!bulkOperations.isEmpty()) {
-            mongoDBHandler.bulkUpdateDocuments("Rede", bulkOperations);
-            System.out.println("Bulk update successful for " + bulkOperations.size() + " documents.");
-        }
-        mongoDBHandler.close();
-    }
-
-    /**
-     * Reads an InputStream completely into a String.
-     *
-     * @param is InputStream
-     * @return the entire String content
-     * @throws IOException if an error occurs while reading
-     */
-    private static String readInputStream(InputStream is) throws IOException {
-        BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
-        StringBuilder sb = new StringBuilder();
-        String line;
-        while ((line = reader.readLine()) != null) {
-            sb.append(line).append("\n");
-        }
-        return sb.toString();
-    }
-
-    /**
-     * Converts the XMI content from the InputStream into a JCas.
-     * The hard-coded, compressed TypeSystem descriptor is decompressed first.
-     *
-     * @param xmiInputStream InputStream of the XMI content
-     * @return the converted JCas
-     * @throws Exception if an error occurs while loading or deserializing
-     */
-    private static JCas convertXmiToJCas(InputStream xmiInputStream) throws Exception {
-        InputStream tsCompressedStream = NlpUtils.class.getResourceAsStream(TYPE_SYSTEM_DESCRIPTOR_PATH);
-        if (tsCompressedStream == null) {
-            throw new IllegalArgumentException("TypeSystem descriptor not found: " + TYPE_SYSTEM_DESCRIPTOR_PATH);
-        }
-        try (GZIPInputStream tsStream = new GZIPInputStream(tsCompressedStream)) {
-            XMLInputSource inputSource = new XMLInputSource(tsStream);
-            TypeSystemDescription tsd = UIMAFramework.getXMLParser().parseTypeSystemDescription(inputSource);
-            CAS cas = CasCreationUtils.createCas(tsd, null, null);
-            XmiCasDeserializer.deserialize(xmiInputStream, cas, true);
-            return cas.getJCas();
-        }
-    }
-
-    /**
-     * Iterates over all annotations in the JCas and processes them.
-     * Extend this method to process the JCas data further.
-     *
-     * @param jcas the converted JCas
-     * @return a list of maps, one per annotation, holding its type name and feature values
-     */
-    private static List<Map<String, Object>> processJCas(JCas jcas) {
-        List<Map<String, Object>> annotationsData = new ArrayList<>();
-        CAS cas = jcas.getCas();
-        AnnotationIndex<AnnotationFS> index = cas.getAnnotationIndex();
-        for (AnnotationFS annotation : index) {
-            Map<String, Object> annotationData = new HashMap<>();
-            annotationData.put("type", annotation.getType().getName());
-            Map<String, String> featuresMap = new HashMap<>();
-            for (Feature feature : annotation.getType().getFeatures()) {
-                String featureName = feature.getShortName();
-                String featureValue = annotation.getFeatureValueAsString(feature);
-                if (featureValue != null && featureValue.length() > MAX_FEATURE_LENGTH) {
-                    featureValue = featureValue.substring(0, MAX_FEATURE_LENGTH); // trim for efficiency
-                }
-                featuresMap.put(featureName, featureValue);
-            }
-            annotationData.put("features", featuresMap);
-            annotationsData.add(annotationData);
-        }
-        return annotationsData;
-    }

 }
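For reference, the core of the removed method - deserializing an XMI stream against a TypeSystem descriptor into a JCas and walking its annotations - uses standard UIMA APIs. A condensed, self-contained sketch, assuming the descriptor XML is available uncompressed on disk (the removed code loaded it gzipped from TYPE_SYSTEM_DESCRIPTOR_PATH; the file names here are placeholders):

    import org.apache.uima.UIMAFramework;
    import org.apache.uima.cas.CAS;
    import org.apache.uima.cas.Feature;
    import org.apache.uima.cas.impl.XmiCasDeserializer;
    import org.apache.uima.cas.text.AnnotationFS;
    import org.apache.uima.jcas.JCas;
    import org.apache.uima.resource.metadata.TypeSystemDescription;
    import org.apache.uima.util.CasCreationUtils;
    import org.apache.uima.util.XMLInputSource;

    import java.io.FileInputStream;
    import java.io.InputStream;

    public class XmiToJCasExample {
        public static void main(String[] args) throws Exception {
            try (InputStream ts = new FileInputStream("TypeSystem.xml"); // placeholder path
                 InputStream xmi = new FileInputStream("speech.xmi")) {  // placeholder path
                // Parse the type system the XMI was serialized with.
                TypeSystemDescription tsd = UIMAFramework.getXMLParser()
                        .parseTypeSystemDescription(new XMLInputSource(ts));
                // Create an empty CAS for that type system and fill it from the XMI.
                CAS cas = CasCreationUtils.createCas(tsd, null, null);
                XmiCasDeserializer.deserialize(xmi, cas, true); // lenient, as in the removed code
                JCas jcas = cas.getJCas();

                // The same traversal processJCas() used: every annotation, every feature.
                for (AnnotationFS annotation : jcas.getCas().getAnnotationIndex()) {
                    System.out.println(annotation.getType().getName());
                    for (Feature feature : annotation.getType().getFeatures()) {
                        System.out.println("  " + feature.getShortName() + " = "
                                + annotation.getFeatureValueAsString(feature));
                    }
                }
            }
        }
    }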