From 861e14b64db71e588db1ef816c4a447a1319abce Mon Sep 17 00:00:00 2001 From: vysitor Date: Sat, 22 Mar 2025 17:20:09 +0100 Subject: [PATCH] Improved loading speech, work in progress, hotfixed RadarChart issue --- .../project/gruppe_05_1/Main.java | 3 + .../gruppe_05_1/database/MongoDBHandler.java | 6 + .../gruppe_05_1/database/MongoPprUtils.java | 151 ++++++++++++++---- .../gruppe_05_1/rest/FrontEndController.java | 2 +- .../gruppe_05_1/rest/SpeechController.java | 2 +- .../templates/sentimentsRadarChart.ftl | 6 +- 6 files changed, 136 insertions(+), 34 deletions(-) diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java index 85aeddb..098d601 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java @@ -107,6 +107,9 @@ public class Main { Logger.pink("Adding Speeches to DB..."); mongoDBHandler.insertSpeeches(speechIndex.getSpeeches()); + Logger.pink("Building Metadata..."); + MongoPprUtils.rebuildMetadata(); + // only upload member photos if database was empty by default, not when speeches are force-overwritten if (!FORCE_UPLOAD_SPEECHES) { Logger.pink("Uploading Member Photos to DB..."); diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java index cde1b92..fc263d4 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java @@ -417,6 +417,12 @@ public class MongoDBHandler { return doc; } + /** + * Liefert einen Feldwert als Double, auch wenn er in der Datenbank als Integer oder String steht + * @param doc Mongo-Dokument + * @param fieldName Feldname + * @return Double + */ public static Double getFieldAsDouble(Document doc, String 
fieldName) { Object obj = doc.get(fieldName); if (obj instanceof Double) return (Double) obj; diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java index 73c6d90..cfd5679 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java @@ -1,9 +1,6 @@ package org.texttechnologylab.project.gruppe_05_1.database; -import com.mongodb.client.FindIterable; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoCursor; -import com.mongodb.client.MongoDatabase; +import com.mongodb.client.*; import com.mongodb.client.model.Accumulators; import com.mongodb.client.model.Aggregates; import com.mongodb.client.model.Filters; @@ -648,6 +645,11 @@ public class MongoPprUtils { } else return doc.getString("base64"); } + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + // Metadata + + /** * Aktualisiert (or erzeugt, falls nicht bereits vorhanden) diverse Metadaten: * - Die Liste der Parteien/Fraktionen, wie sie im Speaker-Collection stehen @@ -661,30 +663,6 @@ public class MongoPprUtils { List distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>()); List distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>()); - Logger.info("Collecting Topics Information"); - Set topics = new HashSet<>(); - - // Aggregation pipeline - List pipeline = List.of( - Aggregates.unwind("$analysisResults.topics"), // Unwind the "topics" array - Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))), // Project only the "topic" field - Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic")) // Group to get distinct values - ); - 
List topicsList = null; - List results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>()); - // Extract and print all distinct "topic" values - if (!results.isEmpty()) { - Document result = results.get(0); // Get the first (and only) document - List distinctTopics = result.getList("distinctTopics", String.class); - topicsList = distinctTopics; - for (String topic : distinctTopics) { - System.out.println(topic); - } - } else { - System.out.println("No topics found."); - } - - Logger.info("Updating Metadata Collection: begin"); MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME); @@ -701,11 +679,126 @@ public class MongoPprUtils { metadataCollection.replaceOne(filterPartiesOfSpeakers, partiesDocOfSpeakers, new com.mongodb.client.model.ReplaceOptions().upsert(true)); + Logger.info("Enriching Speech Information: begin"); + enrichSpeechDocuments(); + Logger.info("Enriching Speech Information: end"); + + Logger.info("Collecting Topics Information"); + + Document unwindStage = new Document("$unwind", "$topics"); // Deconstruct the "topics" array + Document groupStage = new Document("$group", new Document("_id", "$topics")); // Group by "topics" + Document projectStage = new Document("$project", new Document("topic", "$_id").append("_id", 0)); // Optionally format the result + + // Execute the aggregation + AggregateIterable result = getSpeechCollection().aggregate(Arrays.asList(unwindStage, groupStage, projectStage)); + Set topics = new HashSet<>(); + for (Document doc : result) { + topics.add(doc.getString("topic")); + } + Document filterTopics = new Document("type", "topics"); Document topicsDoc = MongoDBHandler.createDocument(false, Map.of("type", "topics", - "value", topicsList)); + "value", topics)); metadataCollection.replaceOne(filterTopics, topicsDoc, new com.mongodb.client.model.ReplaceOptions().upsert(true)); Logger.info("Updating Metadata Collection: end"); } + + public static List getAllPartiesOfSpeakers() { + Document doc 
= MongoDBHandler.findFirstDocumentInCollection(getMetadataCollection(), "type", "parties_of_speakers"); + if (doc == null) {return new ArrayList<>();} + else { + return new ArrayList<>(doc.getList("value", String.class)); + } + } + + + public static final List ALL_PARTIES_FROM_SPEECHES = Arrays.asList( + "Afd", "BSW", "GRÜNEN", "CDU/CSU", "LINKE", "FDP", + "Fraktionslos" /* auch als "fraktionslos" vorhanden!*/, + "SPD", + "keine" /* entspricht dem null-Wert */ + ); + + /** + * Liefert die Liste aller Parteien/Fraktionen, welche in der Liste der Reden stehen, zurück. + * Diese Liste dient zur Filterung der Reden auf der entsprechenden Seite. + * Da die Datenqualität dieses Feldes extrem schlecht ist, muss man hier etwas tricksen: + * - Für Bündnis 90 / Die Grünen sind 5 unterschiedliche Schreibweisen vorhanden + * - Für Die Linke sind ebenfalls 5 unterschiedliche Schreibweisen vorhanden + * - Wegen der unterschiedlichen Schreibweisen muss man für das Frontend mit Pattern Matching arbeiten + * - Bei 6 Reden steht "SPDCDU/CSU". Diese Reden werden dann bei der Filterung nicht berücksichtigt + * - Für 3561 der 25387 Reden wurde keine Partei/Fraktion eingetragen. 
Diese Zahl ist zu hoch, um sie einfach zu ignorieren, daher der Eintrag "keine" + * - Beide Schreibweisen "Fraktionslos" (166 Reden) und "fraktionslos" (311 Reden) sind vorhanden + * @return List Liste aller Parteien/Fraktionen, welche in der Liste der Reden stehen + */ + public static List getAllPartiesFromSpeeches() { + return ALL_PARTIES_FROM_SPEECHES; + } + + /** + * Reichert die Rede-Dokumente um Informationen an: + * - Datum und Uhrzeit der Rede (als DateTime und textuell): dateTime, dateTimeString + * - Agenda-Titel: agendaTitel + * - Die Topics der Rede aus der NLP-Analyse + */ + public static void enrichSpeechDocuments() { + + MongoCollection collection = getSpeechCollection(); + FindIterable documents = collection.find(); + + for (Document doc : documents) { + // Enrich with Info from Session & Agenda, which is always available + if ( ! doc.containsKey("dateTime")) { + int sessionId = doc.getInteger("sessionId"); + int agendaItemId = doc.getInteger("agendaItemId"); + String agendaTitel = getAgendaTitle(sessionId, agendaItemId); + LocalDateTime dateTime = null; + String dateTimeString = getSessionDateTime(sessionId); + if (dateTimeString != null) { + for (String format : Arrays.asList("dd.MM.yyyy HH:mm", + "dd.MM.yyyy H:mm", + "dd.MM.yyyy HH.mm", + "dd.MM.yyyy H.mm")) { + dateTime = GeneralUtils.parseDateTime(dateTimeString,format); + if (dateTime != null) break; + } + if (dateTime == null) {Logger.error(dateTimeString + " could not be parsed");} + } + + Document updateFieldsFromSession = new Document() + .append("dateTime", dateTime) + .append("dateTimeString", dateTimeString) + .append("agendaTitel", agendaTitel); + + collection.updateOne( + new Document("_id", doc.get("_id")), + new Document("$set", updateFieldsFromSession) + ); + } + + // Enrich with NLP Info which is only available after running the analysis + if (( ! 
doc.containsKey("topics")) + && (doc.containsKey("analysisResults"))) { + Document nlpDoc = (Document) doc.get("analysisResults"); + if (nlpDoc.containsKey("topics")) { + Set topics = new HashSet<>(); + List topicsDocs = nlpDoc.getList("topics", Document.class); + for (Document topicDoc : topicsDocs) { + topics.add(topicDoc.getString("topic")); + } + + Document updateFieldsFromTopics = new Document() + .append("topics", topics); + + collection.updateOne( + new Document("_id", doc.get("_id")), + new Document("$set", updateFieldsFromTopics) + ); + } + + } + } + } + } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java index 359d8ad..c5f38df 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java @@ -63,7 +63,7 @@ public class FrontEndController { Map attributes = new HashMap<>(); attributes.put("parlamentarier", parlamentarier); - attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier(""))); + attributes.put("parties", MongoPprUtils.getAllPartiesOfSpeakers()); ctx.render("parlamentarier.ftl", attributes); } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java index 85e5d19..d8bee69 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java @@ -227,7 +227,7 @@ public class SpeechController { attributes.put("filter", filter == null || filter.isBlank() ? 
null : filter); // Filtern nach Partei/Fraktion - attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier(""))); + attributes.put("parties", MongoPprUtils.getAllPartiesFromSpeeches()); // Filtern nach Topics - TODO List topics = Arrays.asList("International", "Government", "Labor", "Economy", "Public"); diff --git a/src/main/resources/templates/sentimentsRadarChart.ftl b/src/main/resources/templates/sentimentsRadarChart.ftl index 882e3a6..db70cba 100644 --- a/src/main/resources/templates/sentimentsRadarChart.ftl +++ b/src/main/resources/templates/sentimentsRadarChart.ftl @@ -5,9 +5,9 @@ <#if sentiments?? && sentiments?size gt 0> <#list sentiments as sentiment> sentimentData.push({ - pos: ${sentiment.positive?number}, - neu: ${sentiment.neutral?number}, - neg: ${sentiment.negative?number} + pos: ${sentiment.positive?string?replace(',', '.')}, + neu: ${sentiment.neutral?string?replace(',', '.')}, + neg: ${sentiment.negative?string?replace(',', '.')}, }); <#else>