Improved loading speech, work in progress, hotfixed RadarChart issue

This commit is contained in:
vysitor 2025-03-22 17:20:09 +01:00
parent 76a12e5a3d
commit 861e14b64d
6 changed files with 136 additions and 34 deletions

View file

@ -107,6 +107,9 @@ public class Main {
Logger.pink("Adding Speeches to DB...");
mongoDBHandler.insertSpeeches(speechIndex.getSpeeches());
Logger.pink("Building Metadata...");
MongoPprUtils.rebuildMetadata();
// only upload member photos if database was empty by default, not when speeches are force-overwritten
if (!FORCE_UPLOAD_SPEECHES) {
Logger.pink("Uploading Member Photos to DB...");

View file

@ -417,6 +417,12 @@ public class MongoDBHandler {
return doc;
}
/**
* Returns a field value as a Double, even if it is stored in the database as an Integer or a String
* @param doc Mongo document
* @param fieldName name of the field
* @return Double
*/
public static Double getFieldAsDouble(Document doc, String fieldName) {
Object obj = doc.get(fieldName);
if (obj instanceof Double) return (Double) obj;

View file

@ -1,9 +1,6 @@
package org.texttechnologylab.project.gruppe_05_1.database;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.*;
import com.mongodb.client.model.Accumulators;
import com.mongodb.client.model.Aggregates;
import com.mongodb.client.model.Filters;
@ -648,6 +645,11 @@ public class MongoPprUtils {
} else return doc.getString("base64");
}
// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Metadata
/**
* Updates (or creates, if not already present) various metadata:
* - The list of parties/fractions as they are stored in the speaker collection
@ -661,30 +663,6 @@ public class MongoPprUtils {
List<String> distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>());
List<String> distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>());
Logger.info("Collecting Topics Information");
Set<String> topics = new HashSet<>();
// Aggregation pipeline
List<Bson> pipeline = List.of(
Aggregates.unwind("$analysisResults.topics"), // Unwind the "topics" array
Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))), // Project only the "topic" field
Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic")) // Group to get distinct values
);
List<String> topicsList = null;
List<Document> results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>());
// Extract and print all distinct "topic" values
if (!results.isEmpty()) {
Document result = results.get(0); // Get the first (and only) document
List<String> distinctTopics = result.getList("distinctTopics", String.class);
topicsList = distinctTopics;
for (String topic : distinctTopics) {
System.out.println(topic);
}
} else {
System.out.println("No topics found.");
}
Logger.info("Updating Metadata Collection: begin");
MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME);
@ -701,11 +679,126 @@ public class MongoPprUtils {
metadataCollection.replaceOne(filterPartiesOfSpeakers, partiesDocOfSpeakers, new com.mongodb.client.model.ReplaceOptions().upsert(true));
Logger.info("Enriching Speech Information: begin");
enrichSpeechDocuments();
Logger.info("Enriching Speech Information: end");
Logger.info("Collecting Topics Information");
Document unwindStage = new Document("$unwind", "$topics"); // Deconstruct the "topics" array
Document groupStage = new Document("$group", new Document("_id", "$topics")); // Group by "topics"
Document projectStage = new Document("$project", new Document("topic", "$_id").append("_id", 0)); // Optionally format the result
// Execute the aggregation
AggregateIterable<Document> result = getSpeechCollection().aggregate(Arrays.asList(unwindStage, groupStage, projectStage));
Set<String> topics = new HashSet<>();
for (Document doc : result) {
topics.add(doc.getString("topic"));
}
Document filterTopics = new Document("type", "topics");
Document topicsDoc = MongoDBHandler.createDocument(false, Map.of("type", "topics",
"value", topicsList));
"value", topics));
metadataCollection.replaceOne(filterTopics, topicsDoc, new com.mongodb.client.model.ReplaceOptions().upsert(true));
Logger.info("Updating Metadata Collection: end");
}
/**
 * Returns the parties of the speakers as stored in the metadata collection.
 *
 * Fetches a document via {@code MongoDBHandler.findFirstDocumentInCollection} using the
 * field "type" and the value "parties_of_speakers", then copies its "value" list into a
 * new list, so callers may modify the result freely.
 *
 * @return a new list with the stored party names; an empty list if no such metadata
 *         document was found or if it has no "value" entry
 */
public static List<String> getAllPartiesOfSpeakers() {
    Document doc = MongoDBHandler.findFirstDocumentInCollection(getMetadataCollection(), "type", "parties_of_speakers");
    if (doc == null) {
        return new ArrayList<>();
    }
    // Document.getList yields null when the key is absent; new ArrayList<>(null) would throw
    List<String> storedParties = doc.getList("value", String.class);
    if (storedParties == null) {
        return new ArrayList<>();
    }
    return new ArrayList<>(storedParties);
}
// Fixed list of party/fraction labels for the speeches, returned by getAllPartiesFromSpeeches().
// NOTE(review): "Afd" differs in casing from the common spelling "AfD" - confirm that the consumer matches case-insensitively.
public static final List<String> ALL_PARTIES_FROM_SPEECHES = Arrays.asList(
"Afd", "BSW", "GRÜNEN", "CDU/CSU", "LINKE", "FDP",
"Fraktionslos" /* also present as "fraktionslos"! */,
"SPD",
"keine" /* corresponds to the null value */
);
/**
 * Returns the list of all parties/fractions that appear in the list of speeches.
 * This list is used to filter the speeches on the corresponding page.
 * Because the data quality of this field is extremely poor, some trickery is needed here:
 * - For Bündnis 90 / Die Grünen there are 5 different spellings
 * - For Die Linke there are likewise 5 different spellings
 * - Because of the different spellings, the frontend has to work with pattern matching
 * - 6 speeches carry "SPDCDU/CSU". These speeches are then not considered by the filter
 * - For 3561 of the 25387 speeches no party/fraction was recorded. This number is too high to simply ignore, hence the entry "keine"
 * - Both spellings "Fraktionslos" (166 speeches) and "fraktionslos" (311 speeches) exist
 * The shared constant is returned directly, not a copy.
 * @return {@code List<String>} list of all parties/fractions that appear in the list of speeches
 */
public static List<String> getAllPartiesFromSpeeches() {
return ALL_PARTIES_FROM_SPEECHES;
}
/**
 * Enriches the speech documents with additional information:
 * - date and time of the speech (parsed and textual): dateTime, dateTimeString
 * - agenda title: agendaTitel
 * - the topics of the speech taken from the NLP analysis: topics
 *
 * Each enrichment is skipped for documents that already contain the respective key
 * ("dateTime" or "topics"). Documents with missing ids or unusable analysis results are
 * logged or skipped instead of aborting the whole run.
 */
public static void enrichSpeechDocuments() {
    MongoCollection<Document> collection = getSpeechCollection();
    for (Document doc : collection.find()) {
        // Info from session & agenda, which is always available
        if (!doc.containsKey("dateTime")) {
            enrichWithSessionInfo(collection, doc);
        }
        // NLP info, which is only available after running the analysis
        if (!doc.containsKey("topics") && doc.containsKey("analysisResults")) {
            enrichWithTopics(collection, doc);
        }
    }
}

/** Date/time patterns tried in this order when parsing the textual session date. */
private static final List<String> SESSION_DATE_TIME_FORMATS = Arrays.asList(
        "dd.MM.yyyy HH:mm",
        "dd.MM.yyyy H:mm",
        "dd.MM.yyyy HH.mm",
        "dd.MM.yyyy H.mm");

/** Sets dateTime, dateTimeString and agendaTitel on one speech document. */
private static void enrichWithSessionInfo(MongoCollection<Document> collection, Document doc) {
    Integer sessionIdValue = doc.getInteger("sessionId");
    Integer agendaItemIdValue = doc.getInteger("agendaItemId");
    if (sessionIdValue == null || agendaItemIdValue == null) {
        // Unboxing a missing id would throw a NullPointerException and stop the enrichment of all remaining speeches
        Logger.error("Speech " + doc.get("_id") + " has no sessionId/agendaItemId, session info not added");
        return;
    }
    int sessionId = sessionIdValue;
    int agendaItemId = agendaItemIdValue;

    String agendaTitel = getAgendaTitle(sessionId, agendaItemId);
    String dateTimeString = getSessionDateTime(sessionId);
    LocalDateTime dateTime = null;
    if (dateTimeString != null) {
        // The first pattern that yields a non-null result wins
        for (String format : SESSION_DATE_TIME_FORMATS) {
            dateTime = GeneralUtils.parseDateTime(dateTimeString, format);
            if (dateTime != null) {
                break;
            }
        }
        if (dateTime == null) {
            Logger.error(dateTimeString + " could not be parsed");
        }
    }

    Document updateFieldsFromSession = new Document()
            .append("dateTime", dateTime)
            .append("dateTimeString", dateTimeString)
            .append("agendaTitel", agendaTitel);
    collection.updateOne(
            new Document("_id", doc.get("_id")),
            new Document("$set", updateFieldsFromSession));
}

/** Copies the distinct topic labels from analysisResults.topics into the top-level "topics" field of one speech document. */
private static void enrichWithTopics(MongoCollection<Document> collection, Document doc) {
    Object analysisResults = doc.get("analysisResults");
    if (analysisResults == null) {
        // Key present but without a value: nothing to copy
        return;
    }
    if (!(analysisResults instanceof Document)) {
        Logger.error("Speech " + doc.get("_id") + " has analysisResults of unexpected type, topics not added");
        return;
    }
    Document nlpDoc = (Document) analysisResults;

    // getList returns null both when the key is absent and when its value is null
    List<Document> topicsDocs = nlpDoc.getList("topics", Document.class);
    if (topicsDocs == null) {
        return;
    }

    Set<String> topics = new HashSet<>();
    for (Document topicDoc : topicsDocs) {
        String topic = (topicDoc == null) ? null : topicDoc.getString("topic");
        // Entries without a label are dropped so that no null topic is stored
        if (topic != null) {
            topics.add(topic);
        }
    }

    Document updateFieldsFromTopics = new Document()
            .append("topics", topics);
    collection.updateOne(
            new Document("_id", doc.get("_id")),
            new Document("$set", updateFieldsFromTopics));
}
}

View file

@ -63,7 +63,7 @@ public class FrontEndController {
Map<String, Object> attributes = new HashMap<>();
attributes.put("parlamentarier", parlamentarier);
attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier("")));
attributes.put("parties", MongoPprUtils.getAllPartiesOfSpeakers());
ctx.render("parlamentarier.ftl", attributes);
}

View file

@ -227,7 +227,7 @@ public class SpeechController {
attributes.put("filter", filter == null || filter.isBlank() ? null : filter);
// Filtern nach Partei/Fraktion
attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier("")));
attributes.put("parties", MongoPprUtils.getAllPartiesFromSpeeches());
// Filtern nach Topics - TODO
List<String> topics = Arrays.asList("International", "Government", "Labor", "Economy", "Public");

View file

@ -5,9 +5,9 @@
<#if sentiments?? && sentiments?size gt 0>
<#list sentiments as sentiment>
sentimentData.push({
pos: ${sentiment.positive?number},
neu: ${sentiment.neutral?number},
neg: ${sentiment.negative?number}
pos: ${sentiment.positive?string?replace(',', '.')},
neu: ${sentiment.neutral?string?replace(',', '.')},
neg: ${sentiment.negative?string?replace(',', '.')},
});
</#list>
<#else>

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Before After
Before After