Updated documentation
parent 62a35e30c7
commit 995d811a93
5 changed files with 39 additions and 27 deletions
@@ -6,7 +6,6 @@ import com.mongodb.client.model.WriteModel;
 import org.apache.uima.fit.util.JCasUtil;
 import org.bson.Document;
 import java.io.*;
 import java.nio.charset.StandardCharsets;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -24,7 +23,10 @@ import org.texttechnologylab.project.gruppe_05_1.util.Logger;

 import static org.texttechnologylab.project.gruppe_05_1.Main.JCAS_SPEECHES_TYPESYSTEM_DIR;

+/**
+ * Extracts NLP analysis results from compressed XMI files and uploads them to MongoDB in batches.
+ * Reads .xmi.gz files from a ZIP archive, creates an "analysisResults" document for each speech, and performs bulk writes.
+ */
 public class XmiExtractor {

     private final List<WriteModel<Document>> bulkOperations = Collections.synchronizedList(new ArrayList<>());
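The synchronized bulkOperations list above is the heart of the batching scheme. A minimal sketch of the pattern, assuming a BATCH_SIZE threshold and a plain MongoCollection (both assumptions; the commit itself routes writes through MongoDBHandler):

    // Sketch of the batch pattern (assumption): producer threads add WriteModels to a
    // synchronized list; flush() snapshots, bulk-writes, and clears it under a lock.
    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.model.WriteModel;
    import org.bson.Document;

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    class BulkBatcher {
        private static final int BATCH_SIZE = 500; // assumed threshold, not from this commit
        private final List<WriteModel<Document>> ops = Collections.synchronizedList(new ArrayList<>());
        private final MongoCollection<Document> collection;

        BulkBatcher(MongoCollection<Document> collection) {
            this.collection = collection;
        }

        void add(WriteModel<Document> op) {
            ops.add(op);
            if (ops.size() >= BATCH_SIZE) {
                flush();
            }
        }

        synchronized void flush() {
            if (!ops.isEmpty()) {
                collection.bulkWrite(new ArrayList<>(ops)); // snapshot, then clear the shared list
                ops.clear();
            }
        }
    }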
@@ -36,6 +38,11 @@ public class XmiExtractor {
         mongoDBHandler = new MongoDBHandler();
     }

+    /**
+     * Reads all .xmi.gz files from the ZIP archive "speeches/20.zip", extracts their NLP data,
+     * and uploads it to MongoDB in batches of BATCH_SIZE documents.
+     * @throws IOException if the ZIP file is not found or a read/write error occurs
+     */
     public void extractAndUploadXmiData() throws IOException {
         InputStream resourceStream = getClass().getClassLoader().getResourceAsStream("speeches/20.zip");
         if (resourceStream == null) {
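The loop over the archive entries is not visible in this hunk. A self-contained sketch of how a classpath ZIP is typically walked and each .xmi.gz entry handed off; the processXmiGzStream call mirrors the method documented below, everything else is an assumption:

    // Iterate a ZIP from the classpath and hand each .xmi.gz entry to a processor.
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.zip.ZipEntry;
    import java.util.zip.ZipInputStream;

    public class ZipWalker {
        public static void main(String[] args) throws IOException {
            InputStream resource = ZipWalker.class.getClassLoader().getResourceAsStream("speeches/20.zip");
            if (resource == null) throw new IOException("speeches/20.zip not on classpath");
            try (ZipInputStream zis = new ZipInputStream(resource)) {
                ZipEntry entry;
                while ((entry = zis.getNextEntry()) != null) {
                    if (entry.getName().endsWith(".xmi.gz")) {
                        System.out.println("would process " + entry.getName());
                        // processXmiGzStream(zis, entry.getName()); // must not close zis per entry
                    }
                    zis.closeEntry();
                }
            }
        }
    }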
@@ -85,6 +92,11 @@ public class XmiExtractor {
         mongoDBHandler.close();
     }

+    /**
+     * Reads a compressed XMI file and creates bulk update operations for MongoDB.
+     * @param inputStream compressed XMI input stream
+     * @param filename name of the file in the ZIP archive (used for the speechKey)
+     */
     private void processXmiGzStream(InputStream inputStream, String filename) {
         JCas jCas = null;
         try (GZIPInputStream gis = new GZIPInputStream(inputStream)) {
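The deserialization step inside the try block is not shown in this hunk. A sketch of how a gzipped XMI stream is commonly turned into a JCas with stock UIMA/uimaFIT calls; using JCasFactory with a classpath type system is an assumption (the project presumably loads its type system from JCAS_SPEECHES_TYPESYSTEM_DIR):

    // Assumption-laden sketch: read one .xmi.gz stream into a fresh JCas.
    import org.apache.uima.fit.factory.JCasFactory;
    import org.apache.uima.jcas.JCas;
    import org.apache.uima.util.XmiCasDeserializer;

    import java.io.InputStream;
    import java.util.zip.GZIPInputStream;

    public class XmiReader {
        public static JCas readXmiGz(InputStream compressed) throws Exception {
            JCas jCas = JCasFactory.createJCas(); // type system discovered on the classpath
            try (GZIPInputStream gis = new GZIPInputStream(compressed)) {
                XmiCasDeserializer.deserialize(gis, jCas.getCas(), true); // lenient parsing
            }
            return jCas;
        }
    }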
@@ -181,11 +193,19 @@ public class XmiExtractor {
         }
     }

+    /**
+     * Extracts the speechKey from the filename (e.g. "20/ABC123.xmi.gz").
+     * @param filename name of the file inside the ZIP archive
+     * @return the speechKey, or null if the format is not recognized
+     */
     private static String extractSpeechKeyFromFilename(String filename) {
         String baseName = filename.replace(".xmi.gz", "");
         return baseName.replace("20/", "");
     }
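A quick usage sketch of the key extraction (the filename is illustrative):

    // "20/ABC123.xmi.gz" -> "ABC123": strip the extension, then the directory prefix.
    public class SpeechKeyDemo {
        public static void main(String[] args) {
            String filename = "20/ABC123.xmi.gz";
            String speechKey = filename.replace(".xmi.gz", "").replace("20/", "");
            System.out.println(speechKey); // prints ABC123
        }
    }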
+    /**
+     * Executes all collected bulk write operations in MongoDB and clears the batch.
+     */
     private synchronized void flushBatch() {
         if (!bulkOperations.isEmpty()) {
             mongoDBHandler.bulkWriteNlpData(bulkOperations);
@@ -1,32 +1,20 @@
 package org.texttechnologylab.project.gruppe_05_1.rest;

 import com.mongodb.client.MongoCollection;
 import gnu.trove.impl.sync.TSynchronizedShortObjectMap;
 import io.javalin.http.Context;
 import io.javalin.openapi.*;
 import org.apache.commons.collections.bag.SynchronizedSortedBag;
 import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
 import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils;
 import org.texttechnologylab.project.gruppe_05_1.domain.html.HtmlSpeech;
 import org.texttechnologylab.project.gruppe_05_1.domain.html.Parlamentarier;
 import org.texttechnologylab.project.gruppe_05_1.domain.html.ParlamentarierDetails;
 import org.texttechnologylab.project.gruppe_05_1.domain.nlp.NamedEntity;
 import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Sentiment;
 import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Token;
 import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Topic;
 import org.texttechnologylab.project.gruppe_05_1.domain.speech.SpeechMetaData;
 import org.texttechnologylab.project.gruppe_05_1.util.Logger;
 import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
 import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech;
 import com.mongodb.client.AggregateIterable;
 import com.mongodb.client.model.Aggregates;
 import com.mongodb.client.model.Accumulators;
 import com.mongodb.client.model.Facet;
 import com.mongodb.client.model.Filters;
 import com.mongodb.client.model.Projections;
 import org.bson.Document;
 import org.bson.conversions.Bson;
 import java.util.*;

 import java.util.ArrayList;
 import java.util.HashMap;
@@ -35,8 +23,6 @@ import java.util.Map;
 import java.util.concurrent.CompletableFuture;
 import java.util.stream.Collectors;

 import static javax.management.Query.eq;
 import static org.texttechnologylab.project.gruppe_05_1.util.PPRUtils.listFractionsFromMembers;

 public class FrontEndController {
     @OpenApi(
@@ -81,6 +67,11 @@ public class FrontEndController {
         ctx.render("parlamentarier.ftl", attributes);
     }

+    /**
+     * Aggregates the NLP results of all speeches (topics, POS, named entities, first sentiment objects)
+     * and passes the summarized data to the charts view.
+     * @param ctx Javalin Context used to render the page with the aggregated chart data
+     */
     public static void getCharts(Context ctx) {
         MongoCollection<Document> col = MongoPprUtils.getSpeechCollection();
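A hedged sketch of the kind of faceted aggregation getCharts could run against that collection; the driver calls (Aggregates, Accumulators, Facet) match the imports above, but every field path is an assumption:

    // Sketch only: count topics and named-entity types in one round trip via $facet.
    import com.mongodb.client.MongoCollection;
    import com.mongodb.client.model.Accumulators;
    import com.mongodb.client.model.Aggregates;
    import com.mongodb.client.model.Facet;
    import org.bson.Document;

    import java.util.Arrays;

    public class ChartAggregation {
        static Document aggregateCharts(MongoCollection<Document> col) {
            return col.aggregate(Arrays.asList(
                    Aggregates.facet(
                            new Facet("topics", // field path "analysisResults.topics" is assumed
                                    Aggregates.unwind("$analysisResults.topics"),
                                    Aggregates.group("$analysisResults.topics.value",
                                            Accumulators.sum("count", 1))),
                            new Facet("namedEntities", // field path is assumed as well
                                    Aggregates.unwind("$analysisResults.namedEntities"),
                                    Aggregates.group("$analysisResults.namedEntities.type",
                                            Accumulators.sum("count", 1))))))
                    .first(); // a single document holding both facet arrays
        }
    }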
@@ -105,7 +105,7 @@ public class SpeechController {
         Map<String, Integer> posCounts = Token.countPOS(tokens);

         List<Token> posList = posCounts.entrySet().stream()
-                .map(entry -> new Token(entry.getKey(), String.valueOf(entry.getValue()), "")) // Lemma remains empty
+                .map(entry -> new Token(entry.getKey(), String.valueOf(entry.getValue()), ""))
                 .collect(Collectors.toList());

         Logger.debug("Sending POS List to NLP - " + posList);
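Token.countPOS itself is not part of this diff. A plausible sketch of what such a POS frequency count looks like with the streams API; SimpleToken is a stand-in for the project's Token class:

    // Group tokens by POS tag and count occurrences (illustrative, not the project's code).
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    public class PosCounter {
        record SimpleToken(String text, String pos, String lemma) {}

        static Map<String, Integer> countPOS(List<SimpleToken> tokens) {
            return tokens.stream()
                    .collect(Collectors.groupingBy(SimpleToken::pos,
                            Collectors.summingInt(t -> 1)));
        }

        public static void main(String[] args) {
            List<SimpleToken> tokens = List.of(
                    new SimpleToken("Rede", "NN", "Rede"),
                    new SimpleToken("hält", "VVFIN", "halten"),
                    new SimpleToken("Bundestag", "NN", "Bundestag"));
            System.out.println(countPOS(tokens)); // {NN=2, VVFIN=1}
        }
    }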
@@ -114,7 +114,7 @@ public class SpeechController {

         } else {
             Logger.debug("POS List is EMPTY");
-            speech.getNlp().setPosList((List) new ArrayList<Token>()); // Ensure it's never null
+            speech.getNlp().setPosList((List) new ArrayList<Token>());
         }

         // NLP: Named Entities
@@ -151,10 +151,8 @@ public class SpeechController {
             attributes.put("na_info", null);
         }

         // TODO: Token is currently stored in a somewhat confusing way: the text attribute holds the
         // POS tag and the pos attribute holds the count of that POS tag. Restructure to avoid confusion.

         // NLP: Sentiments
-        // The first sentiment applies to the whole speech. The remaining sentiments correspond to the sentences.
+        // The first sentiment applies to the whole speech. The remaining sentiments correspond to the sentences. overallSentiments stores all analysis objects, sentiments only those of the individual sentences.
         List<Sentiment> sentiments = speech.getNlp().getSentiments();
         if ((sentiments != null) && ! sentiments.isEmpty()) {
             List<Sentiment> overallSentiments = new ArrayList<>(sentiments);
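The comment above describes the split; a tiny illustrative sketch, with the Sentiment record standing in for the project's class:

    // First element covers the whole speech, the rest are per-sentence scores.
    import java.util.ArrayList;
    import java.util.List;

    public class SentimentSplit {
        record Sentiment(double score) {}

        public static void main(String[] args) {
            List<Sentiment> sentiments = List.of(
                    new Sentiment(0.31),   // whole speech
                    new Sentiment(0.70),   // sentence 1
                    new Sentiment(-0.12)); // sentence 2

            List<Sentiment> overallSentiments = new ArrayList<>(sentiments); // all analysis objects
            Sentiment speechSentiment = sentiments.get(0);                   // whole speech
            List<Sentiment> sentenceSentiments = sentiments.subList(1, sentiments.size());

            System.out.println("speech: " + speechSentiment + ", sentences: " + sentenceSentiments);
        }
    }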
@@ -30,7 +30,6 @@
         <div class="chart">
             <#if aggregatedSentiment?? && (aggregatedSentiment?size gt 0)>
                 <h3>Sentiments Information (as Radar Chart)</h3>
                 <#-- Wrap the aggregated sentiment map in a list so that the chart partial can iterate -->
                 <#assign sentiments = [aggregatedSentiment]>
                 <#include "sentimentsRadarChart.ftl">
             <#else>
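For context, a sketch of the Java side that could feed this template; the attribute name matches the <#if> check above, while the template file name and map type are assumptions:

    // Put the aggregated sentiment map into the model; the template wraps it in a
    // list so the shared radar-chart partial can iterate over it.
    import io.javalin.http.Context;

    import java.util.HashMap;
    import java.util.Map;

    public class ChartsRendering {
        static void render(Context ctx, Map<String, Double> aggregatedSentiment) {
            Map<String, Object> attributes = new HashMap<>();
            attributes.put("aggregatedSentiment", aggregatedSentiment); // read by the <#if> above
            ctx.render("charts.ftl", attributes); // template name is an assumption
        }
    }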