Started metadata work to speed up loading process

This commit is contained in:
vysitor 2025-03-21 22:55:49 +01:00
parent 05405a9329
commit d956acc109
2 changed files with 86 additions and 0 deletions

View file

@ -28,6 +28,8 @@ public class Main {
public static boolean FORCE_UPLOAD_MEMBERS;
public static boolean FORCE_UPLOAD_SPEECHES;
public static boolean ONLY_RUN_WEB;
public static boolean REBUILD_METADATA;
public static boolean DEBUG_LOGGING;
private static final FileObjectFactory xmlFactory = FileObjectFactory.getFactory();
private static final MongoObjectFactory mongoFactory = MongoObjectFactory.getFactory();
@ -43,6 +45,7 @@ public class Main {
FORCE_UPLOAD_MEMBERS = Arrays.asList(args).contains("forceUploadMembers");
FORCE_UPLOAD_SPEECHES = Arrays.asList(args).contains("forceUploadSpeeches");
ONLY_RUN_WEB = Arrays.asList(args).contains("onlyRunWeb");
REBUILD_METADATA = Arrays.asList(args).contains("rebuildMetadata");
DEBUG_LOGGING = Arrays.asList(args).contains("debugLogging");
System.out.println("Starting Multimodal Parliament Explorer...");
@ -52,6 +55,7 @@ public class Main {
System.out.println(" - Force Upload Members: " + FORCE_UPLOAD_MEMBERS);
System.out.println(" - Force Upload Speeches: " + FORCE_UPLOAD_SPEECHES);
System.out.println(" - Only Run javalin Web Server: " + ONLY_RUN_WEB);
System.out.println(" - Rebuild Metadata: " + REBUILD_METADATA);
System.out.println(" - Debug Logging: " + DEBUG_LOGGING);
System.out.println("--------------------------------------------o");
@ -64,6 +68,12 @@ public class Main {
MongoDBHandler mongoDBHandler = new MongoDBHandler();
if (REBUILD_METADATA) {
Logger.info("Rebuilding Metadata...");
MongoPprUtils.rebuildMetadata();
System.exit(0);
}
SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
if ((mongoDBHandler.getDatabase().getCollection(MongoPprUtils.SPEECH_COLLECTION_NAME).countDocuments() != 0) && !FORCE_UPLOAD_SPEECHES) {
Logger.info("Skipping Speech parsing and DB insertion as they are already present...");

View file

@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.database;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Accumulators;
import com.mongodb.client.model.Aggregates;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.Projections;
import io.javalin.http.Context;
@ -47,6 +50,7 @@ public class MongoPprUtils {
public static final String HISTORY_COLLECTION_NAME = "history";
public static final String PICTURES_COLLECTION_NAME = "pictures";
public static final String COMMENT_COLLECTION_NAME = "comment";
public static final String METADATA_COLLECTION_NAME = "metadata";
private static MongoCollection<Document> speakerCollection = null;
private static MongoCollection<Document> speechCollection = null;
@ -54,6 +58,7 @@ public class MongoPprUtils {
private static MongoCollection<Document> agendaItemsCollection = null;
private static MongoCollection<Document> picturesCollection = null;
private static MongoCollection<Document> commentCollection = null;
private static MongoCollection<Document> metadataCollection = null;
public static MongoCollection<Document> getSpeakerCollection() {
if (speakerCollection == null) speakerCollection = MongoDBHandler.getMongoDatabase().getCollection(SPEAKER_COLLECTION_NAME);
@ -80,6 +85,11 @@ public class MongoPprUtils {
return picturesCollection;
}
/**
 * Returns the handle of the metadata collection.
 * The handle is requested from the database on the first call only and kept in the
 * static field {@code metadataCollection} for all later calls.
 *
 * @return the collection named by {@link #METADATA_COLLECTION_NAME}
 */
public static MongoCollection<Document> getMetadataCollection() {
    if (metadataCollection != null) {
        return metadataCollection;
    }
    metadataCollection = MongoDBHandler.getMongoDatabase().getCollection(METADATA_COLLECTION_NAME);
    return metadataCollection;
}
/**
* Create the Speaker Collection and useful indices for it
*/
@ -628,10 +638,76 @@ public class MongoPprUtils {
// getMemberPhoto
/**
 * Returns the picture of a member of parliament.
 * The picture document is looked up in the pictures collection through
 * {@code MongoDBHandler.findFirstDocumentInCollection}, using the key {@code memberId}.
 *
 * @param id the value to look up under {@code memberId}
 * @return the Base64-encoded photo taken from the document's {@code base64} field,
 *         or {@code null} if the lookup yields no document
 */
public static String getMemberPhoto(String id) {
    Document pictureDoc = MongoDBHandler.findFirstDocumentInCollection(getPicturesCollection(), "memberId", id);
    return (pictureDoc != null) ? pictureDoc.getString("base64") : null;
}
/**
 * Updates (or creates, if not yet present) several metadata documents:
 * <ul>
 *   <li>{@code parties_of_speakers}: the distinct {@code party} values of the speaker collection</li>
 *   <li>{@code parties_from_speeches}: the distinct {@code fraction} values of the speech collection
 *       (the two lists differ considerably)</li>
 *   <li>{@code topics}: the distinct {@code analysisResults.topics.topic} values of the speeches
 *       (topics produced by the NLP analysis)</li>
 * </ul>
 * Each list is stored in the metadata collection as one document of the form
 * {@code {type: <name>, value: <list>}} and replaces a previous document of the same type.
 * If no speech carries any topic, an empty list is stored for {@code topics}.
 */
public static void rebuildMetadata() {
    MongoDatabase db = MongoDBHandler.getMongoDatabase();

    Logger.info("Collecting Partei/Fraktion Information");
    List<String> distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>());
    List<String> distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>());

    Logger.info("Collecting Topics Information");
    // One output row per element of the topics array, reduced to the topic name,
    // then collapsed into a single document holding the set of distinct names.
    List<Bson> pipeline = List.of(
            Aggregates.unwind("$analysisResults.topics"),
            Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))),
            Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic"))
    );

    // Start from an empty list: Map.of(...) rejects null values, so a null here would
    // abort the rebuild with a NullPointerException whenever no topics exist.
    List<String> topicsList = List.of();
    List<Document> results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>());
    if (!results.isEmpty()) {
        // Grouping by null yields at most one result document.
        List<String> distinctTopics = results.get(0).getList("distinctTopics", String.class);
        if (distinctTopics != null) {
            topicsList = distinctTopics;
        }
        for (String topic : topicsList) {
            System.out.println(topic);
        }
    } else {
        System.out.println("No topics found.");
    }

    Logger.info("Updating Metadata Collection: begin");
    MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME);
    MongoCollection<Document> metadata = getMetadataCollection();

    upsertMetadataEntry(metadata, "parties_from_speeches", distinctPartiesFromSpeeches);
    upsertMetadataEntry(metadata, "parties_of_speakers", distinctPartiesOfSpeakers);
    upsertMetadataEntry(metadata, "topics", topicsList);

    Logger.info("Updating Metadata Collection: end");
}

/**
 * Replaces the metadata document whose {@code type} equals the given type with a new
 * {@code {type, value}} document, inserting it if no such document exists (upsert).
 *
 * @param collection the metadata collection to write to
 * @param type       the value of the {@code type} field identifying the entry
 * @param values     the list stored under {@code value}; must not be {@code null}
 */
private static void upsertMetadataEntry(MongoCollection<Document> collection, String type, List<String> values) {
    Document filter = new Document("type", type);
    Document replacement = MongoDBHandler.createDocument(false, Map.of("type", type, "value", values));
    collection.replaceOne(filter, replacement, new com.mongodb.client.model.ReplaceOptions().upsert(true));
}
}