Started metadata work to speed up loading process
This commit is contained in:
parent
05405a9329
commit
d956acc109
2 changed files with 86 additions and 0 deletions
|
@ -28,6 +28,8 @@ public class Main {
|
|||
public static boolean FORCE_UPLOAD_MEMBERS;
|
||||
public static boolean FORCE_UPLOAD_SPEECHES;
|
||||
public static boolean ONLY_RUN_WEB;
|
||||
|
||||
public static boolean REBUILD_METADATA;
|
||||
public static boolean DEBUG_LOGGING;
|
||||
private static final FileObjectFactory xmlFactory = FileObjectFactory.getFactory();
|
||||
private static final MongoObjectFactory mongoFactory = MongoObjectFactory.getFactory();
|
||||
|
@ -43,6 +45,7 @@ public class Main {
|
|||
FORCE_UPLOAD_MEMBERS = Arrays.asList(args).contains("forceUploadMembers");
|
||||
FORCE_UPLOAD_SPEECHES = Arrays.asList(args).contains("forceUploadSpeeches");
|
||||
ONLY_RUN_WEB = Arrays.asList(args).contains("onlyRunWeb");
|
||||
REBUILD_METADATA = Arrays.asList(args).contains("rebuildMetadata");
|
||||
DEBUG_LOGGING = Arrays.asList(args).contains("debugLogging");
|
||||
|
||||
System.out.println("Starting Multimodal Parliament Explorer...");
|
||||
|
@ -52,6 +55,7 @@ public class Main {
|
|||
System.out.println(" - Force Upload Members: " + FORCE_UPLOAD_MEMBERS);
|
||||
System.out.println(" - Force Upload Speeches: " + FORCE_UPLOAD_SPEECHES);
|
||||
System.out.println(" - Only Run javalin Web Server: " + ONLY_RUN_WEB);
|
||||
System.out.println(" - Rebuild Metadata: " + REBUILD_METADATA);
|
||||
System.out.println(" - Debug Logging: " + DEBUG_LOGGING);
|
||||
System.out.println("--------------------------------------------o");
|
||||
|
||||
|
@ -64,6 +68,12 @@ public class Main {
|
|||
|
||||
MongoDBHandler mongoDBHandler = new MongoDBHandler();
|
||||
|
||||
if (REBUILD_METADATA) {
|
||||
Logger.info("Rebuilding Metadata...");
|
||||
MongoPprUtils.rebuildMetadata();
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
SpeechIndexFactoryImpl speechIndexFactory = new SpeechIndexFactoryImpl();
|
||||
if ((mongoDBHandler.getDatabase().getCollection(MongoPprUtils.SPEECH_COLLECTION_NAME).countDocuments() != 0) && !FORCE_UPLOAD_SPEECHES) {
|
||||
Logger.info("Skipping Speech parsing and DB insertion as they are already present...");
|
||||
|
|
|
@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.database;
|
|||
import com.mongodb.client.FindIterable;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoCursor;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import com.mongodb.client.model.Accumulators;
|
||||
import com.mongodb.client.model.Aggregates;
|
||||
import com.mongodb.client.model.Filters;
|
||||
import com.mongodb.client.model.Projections;
|
||||
import io.javalin.http.Context;
|
||||
|
@ -47,6 +50,7 @@ public class MongoPprUtils {
|
|||
public static final String HISTORY_COLLECTION_NAME = "history";
|
||||
public static final String PICTURES_COLLECTION_NAME = "pictures";
|
||||
public static final String COMMENT_COLLECTION_NAME = "comment";
|
||||
public static final String METADATA_COLLECTION_NAME = "metadata";
|
||||
|
||||
private static MongoCollection<Document> speakerCollection = null;
|
||||
private static MongoCollection<Document> speechCollection = null;
|
||||
|
@ -54,6 +58,7 @@ public class MongoPprUtils {
|
|||
private static MongoCollection<Document> agendaItemsCollection = null;
|
||||
private static MongoCollection<Document> picturesCollection = null;
|
||||
private static MongoCollection<Document> commentCollection = null;
|
||||
private static MongoCollection<Document> metadataCollection = null;
|
||||
|
||||
public static MongoCollection<Document> getSpeakerCollection() {
|
||||
if (speakerCollection == null) speakerCollection = MongoDBHandler.getMongoDatabase().getCollection(SPEAKER_COLLECTION_NAME);
|
||||
|
@ -80,6 +85,11 @@ public class MongoPprUtils {
|
|||
return picturesCollection;
|
||||
}
|
||||
|
||||
public static MongoCollection<Document> getMetadataCollection() {
|
||||
if (metadataCollection == null) metadataCollection = MongoDBHandler.getMongoDatabase().getCollection(METADATA_COLLECTION_NAME);
|
||||
return metadataCollection;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the Speaker Collection and useful indices for it
|
||||
*/
|
||||
|
@ -628,10 +638,76 @@ public class MongoPprUtils {
|
|||
|
||||
// getMemberPhoto
|
||||
|
||||
/**
|
||||
* Liefert das Bild eines Abgeordneten zurück
|
||||
* @param id
|
||||
* @return Base64-encoded Photo
|
||||
*/
|
||||
public static String getMemberPhoto(String id) {
|
||||
Document doc = MongoDBHandler.findFirstDocumentInCollection(getPicturesCollection(), "memberId", id);
|
||||
if (doc == null) {
|
||||
return null;
|
||||
} else return doc.getString("base64");
|
||||
}
|
||||
|
||||
/**
|
||||
* Aktualisiert (or erzeugt, falls nicht bereits vorhanden) diverse Metadaten:
|
||||
* - Die Liste der Parteien/Fraktionen, wie sie im Speaker-Collection stehen
|
||||
* - Die Liste der Parteien/Fraktionen, wie sie im Speech-Collection stehen (diese Listen sind recht unterschiedlich)
|
||||
* - Topics nach NLP-Analyse der Reden
|
||||
*/
|
||||
public static void rebuildMetadata() {
|
||||
MongoDatabase db = MongoDBHandler.getMongoDatabase();
|
||||
|
||||
Logger.info("Collecting Partei/Fraktion Information");
|
||||
List<String> distinctPartiesOfSpeakers = getSpeakerCollection().distinct("party", String.class).into(new java.util.ArrayList<>());
|
||||
List<String> distinctPartiesFromSpeeches = getSpeechCollection().distinct("fraction", String.class).into(new java.util.ArrayList<>());
|
||||
|
||||
Logger.info("Collecting Topics Information");
|
||||
Set<String> topics = new HashSet<>();
|
||||
|
||||
// Aggregation pipeline
|
||||
List<Bson> pipeline = List.of(
|
||||
Aggregates.unwind("$analysisResults.topics"), // Unwind the "topics" array
|
||||
Aggregates.project(Projections.fields(Projections.include("analysisResults.topics.topic"))), // Project only the "topic" field
|
||||
Aggregates.group(null, Accumulators.addToSet("distinctTopics", "$analysisResults.topics.topic")) // Group to get distinct values
|
||||
);
|
||||
List<String> topicsList = null;
|
||||
List<Document> results = getSpeechCollection().aggregate(pipeline).into(new java.util.ArrayList<>());
|
||||
// Extract and print all distinct "topic" values
|
||||
if (!results.isEmpty()) {
|
||||
Document result = results.get(0); // Get the first (and only) document
|
||||
List<String> distinctTopics = result.getList("distinctTopics", String.class);
|
||||
topicsList = distinctTopics;
|
||||
for (String topic : distinctTopics) {
|
||||
System.out.println(topic);
|
||||
}
|
||||
} else {
|
||||
System.out.println("No topics found.");
|
||||
}
|
||||
|
||||
|
||||
Logger.info("Updating Metadata Collection: begin");
|
||||
|
||||
MongoDBHandler.createCollection(db, METADATA_COLLECTION_NAME);
|
||||
MongoCollection<Document> metadataCollection = getMetadataCollection();
|
||||
|
||||
Document filterPartiesFromSpeeches = new Document("type", "parties_from_speeches");
|
||||
Document partiesDocFromSpeeches = MongoDBHandler.createDocument(false, Map.of("type", "parties_from_speeches",
|
||||
"value", distinctPartiesFromSpeeches));
|
||||
metadataCollection.replaceOne(filterPartiesFromSpeeches, partiesDocFromSpeeches, new com.mongodb.client.model.ReplaceOptions().upsert(true));
|
||||
|
||||
Document filterPartiesOfSpeakers = new Document("type", "parties_of_speakers");
|
||||
Document partiesDocOfSpeakers = MongoDBHandler.createDocument(false, Map.of("type", "parties_of_speakers",
|
||||
"value", distinctPartiesOfSpeakers));
|
||||
metadataCollection.replaceOne(filterPartiesOfSpeakers, partiesDocOfSpeakers, new com.mongodb.client.model.ReplaceOptions().upsert(true));
|
||||
|
||||
|
||||
Document filterTopics = new Document("type", "topics");
|
||||
Document topicsDoc = MongoDBHandler.createDocument(false, Map.of("type", "topics",
|
||||
"value", topicsList));
|
||||
metadataCollection.replaceOne(filterTopics, topicsDoc, new com.mongodb.client.model.ReplaceOptions().upsert(true));
|
||||
|
||||
Logger.info("Updating Metadata Collection: end");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue