Started metadata work to speed up loading process

2025-03-21 22:55:49 +01:00 · 2025-03-21 22:55:49 +01:00 · d956acc109
commit d956acc109
parent 05405a9329
2 changed files with 86 additions and 0 deletions
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java
@ -28,6 +28,8 @@ public class Main {
    public static boolean FORCE_UPLOAD_MEMBERS;
    public static boolean FORCE_UPLOAD_SPEECHES;
    public static boolean ONLY_RUN_WEB;
+
+    public static boolean REBUILD_METADATA;
    public static boolean DEBUG_LOGGING;
    private static final FileObjectFactory xmlFactory = FileObjectFactory.getFactory();
    private static final MongoObjectFactory mongoFactory = MongoObjectFactory.getFactory();
@ -43,6 +45,7 @@ public class Main {
        FORCE_UPLOAD_MEMBERS = Arrays.asList(args).contains("forceUploadMembers");
        FORCE_UPLOAD_SPEECHES = Arrays.asList(args).contains("forceUploadSpeeches");
        ONLY_RUN_WEB = Arrays.asList(args).contains("onlyRunWeb");
+        REBUILD_METADATA = Arrays.asList(args).contains("rebuildMetadata");
        DEBUG_LOGGING = Arrays.asList(args).contains("debugLogging");

        System.out.println("Starting Multimodal Parliament Explorer...");
@ -52,6 +55,7 @@ public class Main {
        System.out.println(" - Force Upload Members:               " + FORCE_UPLOAD_MEMBERS);
        System.out.println(" - Force Upload Speeches:              " + FORCE_UPLOAD_SPEECHES);
        System.out.println(" - Only Run javalin Web Server:        " + ONLY_RUN_WEB);
+        System.out.println(" - Rebuild Metadata:                   " + REBUILD_METADATA);
        System.out.println(" - Debug Logging:                      " + DEBUG_LOGGING);
        System.out.println("--------------------------------------------o");

@ -64,6 +68,12 @@ public class Main {

        MongoDBHandler mongoDBHandler = new MongoDBHandler();

+        if (REBUILD_METADATA) {
+            Logger.info("Rebuilding Metadata...");
+            MongoPprUtils.rebuildMetadata();
+            System.exit(0);
+        }
+
        SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
        if ((mongoDBHandler.getDatabase().getCollection(MongoPprUtils.SPEECH_COLLECTION_NAME).countDocuments() != 0) && !FORCE_UPLOAD_SPEECHES) {
            Logger.info("Skipping Speech parsing and DB insertion as they are already present...");
--- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java
+++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java
@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.database;
 import com.mongodb.client.FindIterable;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoCursor;
+import com.mongodb.client.MongoDatabase;
+import com.mongodb.client.model.Accumulators;
+import com.mongodb.client.model.Aggregates;
 import com.mongodb.client.model.Filters;
 import com.mongodb.client.model.Projections;
 import io.javalin.http.Context;
@ -47,6 +50,7 @@ public class MongoPprUtils {
    public static final String HISTORY_COLLECTION_NAME = "history";
    public static final String PICTURES_COLLECTION_NAME = "pictures";
    public static final String COMMENT_COLLECTION_NAME = "comment";
+    public static final String METADATA_COLLECTION_NAME = "metadata";

    private static MongoCollection<Document> speakerCollection = null;
    private static MongoCollection<Document> speechCollection = null;
@ -54,6 +58,7 @@ public class MongoPprUtils {
    private static MongoCollection<Document> agendaItemsCollection = null;
    private static MongoCollection<Document> picturesCollection = null;
    private static MongoCollection<Document> commentCollection = null;
+    private static MongoCollection<Document> metadataCollection = null;

    public static MongoCollection<Document> getSpeakerCollection() {
        if (speakerCollection == null) speakerCollection = MongoDBHandler.getMongoDatabase().getCollection(SPEAKER_COLLECTION_NAME);
@ -80,6 +85,11 @@ public class MongoPprUtils {
        return picturesCollection;
    }

+    /**
+     * Returns the cached handle to the metadata collection, looking it up
+     * under {@link #METADATA_COLLECTION_NAME} on first use.
+     *
+     * @return the metadata collection handle
+     */
+    public static MongoCollection<Document> getMetadataCollection() {
+        if (metadataCollection != null) {
+            return metadataCollection;
+        }
+        metadataCollection = MongoDBHandler.getMongoDatabase().getCollection(METADATA_COLLECTION_NAME);
+        return metadataCollection;
+    }
+
    /**
     * Create the Speaker Collection and useful indices for it
     */
@ -628,10 +638,76 @@ public class MongoPprUtils {

    // getMemberPhoto

+    /**
+     * Returns the photo of a member of parliament.
+     * @param id value matched against the "memberId" field of the pictures collection
+     * @return the Base64-encoded photo from the "base64" field, or null if the lookup yields no document
+     */
    public static String getMemberPhoto(String id) {
        Document doc = MongoDBHandler.findFirstDocumentInCollection(getPicturesCollection(), "memberId", id);
        if (doc == null) {
            return null;
        } else return doc.getString("base64");
    }
+
+    /**
+     * Updates (or creates, if not yet present) the metadata documents used to avoid
+     * recomputing these lists on every load. One document is upserted per type:
+     * <ul>
+     *   <li>"parties_from_speeches": distinct values of the "fraction" field in the speech collection</li>
+     *   <li>"parties_of_speakers": distinct values of the "party" field in the speaker collection
+     *       (kept separately from the list derived from speeches)</li>
+     *   <li>"topics": distinct values of "analysisResults.topics.topic" across the speech collection</li>
+     * </ul>
+     * If the topic aggregation yields nothing, an empty topic list is stored.
+     */
+    public static void rebuildMetadata() {
+        MongoDatabase db = MongoDBHandler.getMongoDatabase();
+
+        Logger.info("Collecting Partei/Fraktion Information");
+        List<String> distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>());
+        List<String> distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>());
+
+        Logger.info("Collecting Topics Information");
+        // Unwind the topics array, keep only the topic name, then fold all names
+        // into a single set held by one result document.
+        List<Bson> pipeline = List.of(
+                Aggregates.unwind("$analysisResults.topics"),
+                Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))),
+                Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic"))
+        );
+        List<Document> results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>());
+
+        // Start from an empty list: Map.of rejects null values, so a null here
+        // would abort the rebuild with a NullPointerException when no topics exist.
+        List<String> topicsList = List.of();
+        if (!results.isEmpty()) {
+            // Grouping with a null id produces at most one document.
+            List<String> distinctTopics = results.get(0).getList("distinctTopics", String.class);
+            if (distinctTopics != null) {
+                topicsList = distinctTopics;
+            }
+            for (String topic : topicsList) {
+                System.out.println(topic);
+            }
+        } else {
+            System.out.println("No topics found.");
+        }
+
+        Logger.info("Updating Metadata Collection: begin");
+
+        MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME);
+        MongoCollection<Document> metadata = getMetadataCollection();
+
+        upsertMetadataEntry(metadata, "parties_from_speeches", distinctPartiesFromSpeeches);
+        upsertMetadataEntry(metadata, "parties_of_speakers", distinctPartiesOfSpeakers);
+        upsertMetadataEntry(metadata, "topics", topicsList);
+
+        Logger.info("Updating Metadata Collection: end");
+    }
+
+    /**
+     * Replaces the metadata document whose "type" equals {@code type} with a document
+     * built from the given type and values, inserting it if no such document exists.
+     *
+     * @param collection the metadata collection to write to
+     * @param type       value of the "type" field identifying the metadata entry
+     * @param values     list stored under the "value" field; must not be null
+     */
+    private static void upsertMetadataEntry(MongoCollection<Document> collection, String type, List<String> values) {
+        Document replacement = MongoDBHandler.createDocument(false, Map.of("type", type, "value", values));
+        collection.replaceOne(new Document("type", type), replacement, new com.mongodb.client.model.ReplaceOptions().upsert(true));
+    }
 }