Improved loading speech, work in progress, hotfixed RadarChart issue

2025-03-22 17:20:09 +01:00 · 2025-03-22 17:20:09 +01:00 · 861e14b64d
commit 861e14b64d
parent 76a12e5a3d
6 changed files with 136 additions and 34 deletions
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
@ -107,6 +107,9 @@ public class Main {
            Logger.pink("Adding Speeches to DB...");
            mongoDBHandler.insertSpeeches(speechIndex.getSpeeches());

+            Logger.pink("Building Metadata...");
+            MongoPprUtils.rebuildMetadata();
+
            // only upload member photos if database was empty by default, not when speeches are force-overwritten
            if (!FORCE_UPLOAD_SPEECHES) {
                Logger.pink("Uploading Member Photos to DB...");
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java
@ -417,6 +417,12 @@ public class MongoDBHandler {
        return doc;
    }

+    /**
+     * Liefert einen Feldwert als Double, auch wenn er in der Datenbank als Integer oder String steht
+     * @param doc Mongo-Dokument
+     * @param fieldName Feldname
+     * @return Double
+     */
    public static Double getFieldAsDouble(Document doc, String fieldName) {
        Object obj = doc.get(fieldName);
        if (obj instanceof Double) return (Double)  obj;
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java
@ -1,9 +1,6 @@
 package org.texttechnologylab.project.gruppe_05_1.database;

-import com.mongodb.client.FindIterable;
-import com.mongodb.client.MongoCollection;
-import com.mongodb.client.MongoCursor;
-import com.mongodb.client.MongoDatabase;
+import com.mongodb.client.*;
 import com.mongodb.client.model.Accumulators;
 import com.mongodb.client.model.Aggregates;
 import com.mongodb.client.model.Filters;
@ -648,6 +645,11 @@ public class MongoPprUtils {
        } else return doc.getString("base64");
    }

+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+    // Metadata
+
+
    /**
     * Aktualisiert (or erzeugt, falls nicht bereits vorhanden) diverse Metadaten:
     * - Die Liste der Parteien/Fraktionen, wie sie im Speaker-Collection stehen
@ -661,30 +663,6 @@ public class MongoPprUtils {
        List<String> distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>());
        List<String> distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>());

-        Logger.info("Collecting Topics Information");
-        Set<String> topics = new HashSet<>();
-
-        // Aggregation pipeline
-        List<Bson> pipeline = List.of(
-                Aggregates.unwind("$analysisResults.topics"),  // Unwind the "topics" array
-                Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))),  // Project only the "topic" field
-                Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic"))  // Group to get distinct values
-        );
-        List<String> topicsList = null;
-        List<Document> results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>());
-        // Extract and print all distinct "topic" values
-        if (!results.isEmpty()) {
-            Document result = results.get(0); // Get the first (and only) document
-            List<String> distinctTopics = result.getList("distinctTopics", String.class);
-            topicsList = distinctTopics;
-            for (String topic : distinctTopics) {
-                System.out.println(topic);
-            }
-        } else {
-            System.out.println("No topics found.");
-        }
-
-
        Logger.info("Updating Metadata Collection: begin");

        MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME);
@ -701,11 +679,126 @@ public class MongoPprUtils {
        metadataCollection.replaceOne(filterPartiesOfSpeakers, partiesDocOfSpeakers, new com.mongodb.client.model.ReplaceOptions().upsert(true));


+        Logger.info("Enriching Speech Information: begin");
+        enrichSpeechDocuments();
+        Logger.info("Enriching Speech Information: end");
+
+        Logger.info("Collecting Topics Information");
+
+        Document unwindStage = new Document("$unwind", "$topics"); // Deconstruct the "topics" array
+        Document groupStage = new Document("$group", new Document("_id", "$topics")); // Group by "topics"
+        Document projectStage = new Document("$project", new Document("topic", "$_id").append("_id", 0)); // Optionally format the result
+
+        // Execute the aggregation
+        AggregateIterable<Document> result = getSpeechCollection().aggregate(Arrays.asList(unwindStage, groupStage, projectStage));
+        Set<String> topics = new HashSet<>();
+        for (Document doc : result) {
+            topics.add(doc.getString("topic"));
+        }
+
        Document filterTopics = new Document("type", "topics");
        Document topicsDoc = MongoDBHandler.createDocument(false, Map.of("type", "topics",
-                "value", topicsList));
+                "value", topics));
        metadataCollection.replaceOne(filterTopics, topicsDoc, new com.mongodb.client.model.ReplaceOptions().upsert(true));

        Logger.info("Updating Metadata Collection: end");
    }
+
+    public static List<String> getAllPartiesOfSpeakers() { // looks up the metadata document via ("type", "parties_of_speakers") and returns its "value" list
+        Document doc = MongoDBHandler.findFirstDocumentInCollection(getMetadataCollection(), "type", "parties_of_speakers");
+        if (doc == null) {return new ArrayList<>();} // no document came back: hand out an empty, mutable list instead of null
+        else {
+            return new ArrayList<>(doc.getList("value", String.class)); // fresh copy of the stored list; NOTE(review): getList yields null when "value" is absent, which would throw here - confirm the field is always written
+        }
+    }
+
+
+    public static final List<String> ALL_PARTIES_FROM_SPEECHES = Arrays.asList( // hard-coded party/fraction labels; NOTE(review): Arrays.asList still permits set(), so callers could alter this shared list
+            "Afd", "BSW", "GRÜNEN", "CDU/CSU", "LINKE", "FDP", // NOTE(review): "Afd" differs from the usual "AfD" casing - confirm the consumer matches case-insensitively
+            "Fraktionslos" /* also occurs as "fraktionslos"! */,
+            "SPD",
+            "keine" /* stands for the null value */
+    );
+
+    /**
+     * Returns the parties/fractions offered for filtering speeches; this is the fixed list ALL_PARTIES_FROM_SPEECHES, not a database query.
+     * The list is used to filter the speeches on the corresponding page.
+     * The data quality of the fraction field is, according to the original author, extremely poor, so some workarounds are needed:
+     * - Buendnis 90 / Die Gruenen appears in 5 different spellings
+     * - Die Linke likewise appears in 5 different spellings
+     * - because of these spelling variants the frontend has to work with pattern matching
+     * - 6 speeches carry "SPDCDU/CSU"; those speeches are not covered by the filter
+     * - 3561 of the 25387 speeches have no party/fraction at all; that is too many to ignore, hence the entry "keine"
+     * - both spellings "Fraktionslos" (166 speeches) and "fraktionslos" (311 speeches) occur
+     * @return the shared constant list of party/fraction labels (the same instance on every call)
+     */
+    public static List<String> getAllPartiesFromSpeeches() {
+        return ALL_PARTIES_FROM_SPEECHES;
+    }
+
+    /**
+     * Enriches every document of the speech collection with derived fields (one updateOne call per enrichment step):
+     * - date and time obtained via getSessionDateTime, both parsed and as text: dateTime, dateTimeString
+     * - agenda title obtained via getAgendaTitle: agendaTitel
+     * - the topic names found under analysisResults.topics, stored as top-level field: topics
+     */
+    public static void enrichSpeechDocuments() {
+
+        MongoCollection<Document> collection = getSpeechCollection();
+        FindIterable<Document> documents = collection.find(); // unfiltered find(): the loop below visits every speech document
+
+        for (Document doc : documents) {
+            // Step 1: session/agenda information; skipped when the document already carries a "dateTime" key
+            if ( ! doc.containsKey("dateTime")) {
+                int sessionId = doc.getInteger("sessionId"); // NOTE(review): unboxing throws if the field is missing - confirm every speech has sessionId
+                int agendaItemId = doc.getInteger("agendaItemId"); // NOTE(review): same unboxing concern for agendaItemId
+                String agendaTitel = getAgendaTitle(sessionId, agendaItemId);
+                LocalDateTime dateTime = null;
+                String dateTimeString = getSessionDateTime(sessionId);
+                if (dateTimeString != null) {
+                    for (String format : Arrays.asList("dd.MM.yyyy HH:mm", // try each known notation in order; the first non-null parse result wins
+                            "dd.MM.yyyy H:mm",
+                            "dd.MM.yyyy HH.mm",
+                            "dd.MM.yyyy H.mm")) {
+                        dateTime = GeneralUtils.parseDateTime(dateTimeString,format); // presumably returns null when the format does not match - verify in GeneralUtils
+                        if (dateTime != null) break;
+                    }
+                    if (dateTime == null) {Logger.error(dateTimeString + " could not be parsed");}
+                }
+
+                Document updateFieldsFromSession = new Document()
+                        .append("dateTime", dateTime) // may be null; NOTE(review): the key is written anyway, so the containsKey check above would skip this document on later runs - confirm intended
+                        .append("dateTimeString", dateTimeString)
+                        .append("agendaTitel", agendaTitel);
+
+                collection.updateOne(
+                        new Document("_id", doc.get("_id")),
+                        new Document("$set", updateFieldsFromSession)
+                );
+            }
+
+            // Step 2: NLP information; only for documents that have "analysisResults" but no top-level "topics" yet
+            if (( ! doc.containsKey("topics"))
+                    && (doc.containsKey("analysisResults"))) {
+                Document nlpDoc = (Document) doc.get("analysisResults");
+                if (nlpDoc.containsKey("topics")) {
+                    Set<String> topics = new HashSet<>(); // a set, so repeated topic names are stored only once
+                    List<Document> topicsDocs = nlpDoc.getList("topics", Document.class);
+                    for (Document topicDoc : topicsDocs) {
+                        topics.add(topicDoc.getString("topic"));
+                    }
+
+                    Document updateFieldsFromTopics = new Document()
+                            .append("topics", topics);
+
+                    collection.updateOne(
+                            new Document("_id", doc.get("_id")),
+                            new Document("$set", updateFieldsFromTopics)
+                    );
+                }
+
+            }
+        }
+    }
+
 }
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/FrontEndController.java
@ -63,7 +63,7 @@ public class FrontEndController {

        Map<String, Object> attributes = new HashMap<>();
        attributes.put("parlamentarier", parlamentarier);
-        attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier("")));
+        attributes.put("parties", MongoPprUtils.getAllPartiesOfSpeakers());
        ctx.render("parlamentarier.ftl", attributes);
    }

--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java
@ -227,7 +227,7 @@ public class SpeechController {
        attributes.put("filter", filter == null || filter.isBlank() ? null : filter);

        // Filtern nach Partei/Fraktion
-        attributes.put("parties", listFractionsFromMembers(MongoPprUtils.getAllParlamentarier("")));
+        attributes.put("parties", MongoPprUtils.getAllPartiesFromSpeeches());

        // Filtern nach Topics - TODO
        List<String> topics = Arrays.asList("International", "Government", "Labor", "Economy", "Public");
--- a/src/main/resources/templates/sentimentsRadarChart.ftl
+++ b/src/main/resources/templates/sentimentsRadarChart.ftl
@ -5,9 +5,9 @@
    <#if sentiments?? && sentiments?size gt 0>
    <#list sentiments as sentiment>
    sentimentData.push({
-        pos: ${sentiment.positive?number},
-        neu: ${sentiment.neutral?number},
-        neg: ${sentiment.negative?number}
+        pos: ${sentiment.positive?string?replace(',', '.')},
+        neu: ${sentiment.neutral?string?replace(',', '.')},
+        neg: ${sentiment.negative?string?replace(',', '.')},
    });
    </#list>
    <#else>