From e6ef7adc6c6be19472e54dd9db79bbf6293cb3d5 Mon Sep 17 00:00:00 2001 From: s5260822 Date: Tue, 18 Mar 2025 17:18:50 +0100 Subject: [PATCH] merge conflict reslove --- .../project/gruppe_05_1/Main.java | 42 +++++- .../gruppe_05_1/database/MongoDBHandler.java | 11 +- .../gruppe_05_1/database/MongoPprUtils.java | 33 ++++- .../gruppe_05_1/domain/html/HtmlSpeech.java | 46 ++++++- .../gruppe_05_1/domain/nlp/AudioToken.java | 87 ++++++++++++ .../gruppe_05_1/domain/nlp/Dependency.java | 64 +++++++++ .../gruppe_05_1/domain/nlp/NamedEntity.java | 55 ++++++++ .../gruppe_05_1/domain/nlp/NlpInfo.java | 123 +++++++++++++++++ .../project/gruppe_05_1/domain/nlp/Pos.java | 119 ++++++++++++++++ .../gruppe_05_1/domain/nlp/Sentence.java | 44 ++++++ .../gruppe_05_1/domain/nlp/Sentiment.java | 97 +++++++++++++ .../project/gruppe_05_1/domain/nlp/Token.java | 96 +++++++++++++ .../project/gruppe_05_1/domain/nlp/Topic.java | 107 +++++++++++++++ .../domain/nlp/VideoInformation.java | 43 ++++++ .../domain/nlp/html/SentimentOfSentence.java | 112 +++++++++++++++ .../project/gruppe_05_1/nlp/NlpUtils.java | 2 + .../project/gruppe_05_1/nlp/XmiExtractor.java | 128 +++++++++--------- .../rest/ParlamentarierController.java | 5 + .../gruppe_05_1/rest/SpeechController.java | 36 +++++ .../project/gruppe_05_1/util/PPRUtils.java | 59 ++++++++ .../xml/speeches/SpeechParser.java | 21 ++- .../members_of_parliament_image_crawler.py | 2 +- .../templates/parlamentarierDetails.ftl | 3 +- src/main/resources/templates/speech.ftl | 18 ++- .../resources/templates/topicsBubbleChart.ftl | 1 - 25 files changed, 1266 insertions(+), 88 deletions(-) create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/AudioToken.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Dependency.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NamedEntity.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NlpInfo.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Pos.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentence.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentiment.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Token.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Topic.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/VideoInformation.java create mode 100644 src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/html/SentimentOfSentence.java diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java index 05eb9c7..95a20a5 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/Main.java @@ -8,10 +8,18 @@ import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler; import org.texttechnologylab.project.gruppe_05_1.util.Logger; import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils; import org.texttechnologylab.project.gruppe_05_1.xml.FileObjectFactory; +import org.texttechnologylab.project.gruppe_05_1.xml.speeches.SpeechParser; +import org.w3c.dom.Document; + import java.util.Arrays; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; import static java.lang.Boolean.FALSE; import static java.lang.Boolean.TRUE; +import static org.texttechnologylab.project.gruppe_05_1.util.PPRUtils.checkAndProcessNewProtocols; public class Main { public static boolean UPLOAD_MEMBER_PHOTOS; @@ -116,13 +124,33 @@ public class Main { Logger.pink("Uploading Member Photos to DB..."); mongoDBHandler.uploadMemberPhotos(); } - mongoDBHandler.close(); - try { - NlpUtils.runRemoteDriver(); - } catch (Exception e) { - Logger.error("Error while running NLP remote driver"); - Logger.error(e.getMessage()); - } + NlpUtils.runRemoteDriver(); + /*ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + scheduler.scheduleAtFixedRate(() -> { + try { + NlpUtils.runRemoteDriver(); + } catch (Exception e) { + Logger.error("Error while running NLP remote driver"); + Logger.error(e.getMessage()); + } + try { + Logger.info("Starte Aktualisierung der Protokolle..."); + Set newProtocols = checkAndProcessNewProtocols(mongoDBHandler); + Logger.info("Neue Protokolle gefunden: " + newProtocols.size()); + if (newProtocols.isEmpty()) { + Logger.info("Keine neuen Protokolle gefunden, Upload wird übersprungen."); + } else { + SpeechParser speechParser = new SpeechParser(); + mongoDBHandler.insertSessions(speechParser.parseAllSessions(newProtocols)); + mongoDBHandler.insertAgendaItems(speechParser.getAgendaItems()); + mongoDBHandler.insertSpeeches(speechParser.getSpeeches()); + Logger.info("Neuer Protokolle uploaded: " + newProtocols.size()); + } + } catch (Exception ex) { + Logger.error("Fehler bei der Protokollaktualisierung: " + ex.getMessage()); + } + }, 0, 10, TimeUnit.MINUTES);*/ + RESTHandler restHandler = new RESTHandler(); restHandler.startJavalin(); diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java index 6ac6e28..b7de73c 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoDBHandler.java @@ -3,6 +3,7 @@ package org.texttechnologylab.project.gruppe_05_1.database; import com.mongodb.MongoClientSettings; import com.mongodb.MongoCredential; import com.mongodb.ServerAddress; +import com.mongodb.WriteConcern; import com.mongodb.bulk.BulkWriteResult; import com.mongodb.client.MongoClient; import com.mongodb.client.MongoClients; @@ -691,7 +692,9 @@ public class MongoDBHandler { public void bulkWriteNlpData(List> bulkOperations) { if (!bulkOperations.isEmpty()) { - BulkWriteResult result = speechesCollection.bulkWrite(bulkOperations); + BulkWriteOptions options = new BulkWriteOptions().ordered(false); + // Optional: Setze einen weniger strengen Write Concern + BulkWriteResult result = speechesCollection.bulkWrite(bulkOperations, options); int modifiedCount = result.getModifiedCount(); int matchedCount = result.getMatchedCount(); int upsertCount = result.getUpserts().size(); @@ -764,6 +767,12 @@ public class MongoDBHandler { } } + public boolean sessionExists(String sessionNumber) { + Document filter = new Document("sessionNumber", sessionNumber); + long count = sessionsCollection.countDocuments(filter); + return count > 0; + } + public String getMemberPhoto(String memberId) { Document photoDocument = memberPhotoCollection.find(eq("memberId", memberId)).first(); if (photoDocument == null) { diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java index 22ce309..d86c4d9 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/database/MongoPprUtils.java @@ -8,6 +8,7 @@ import org.texttechnologylab.project.gruppe_05_1.database.domainimp.speeches.Spe import org.texttechnologylab.project.gruppe_05_1.domain.html.HtmlSpeech; import org.texttechnologylab.project.gruppe_05_1.domain.html.Parlamentarier; import org.texttechnologylab.project.gruppe_05_1.domain.html.ParlamentarierDetails; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.*; import org.texttechnologylab.project.gruppe_05_1.domain.speaker.Membership; import org.texttechnologylab.project.gruppe_05_1.domain.speech.SpeechMetaData; import org.texttechnologylab.project.gruppe_05_1.util.GeneralUtils; @@ -162,6 +163,24 @@ public class MongoPprUtils { return p; } + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + // Parlamentarier - Picture + + /** + * + * @param id : ID des Parlamentariers + * @return Das Foto (als Base64-encoded String) + */ + public static String getParlamentarierPictureByID(String id) { + Document doc = MongoDBHandler.findFirstDocumentInCollection(getPicturesCollection(), "memberId", id); + if (doc == null) { + return null; + } else return doc.getString("base64"); + } + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -362,7 +381,11 @@ public class MongoPprUtils { // Sortiere nach Datum, absteigend speechMetaDataList.sort((md1, md2) -> { - return md2.getDateTime().compareTo(md1.getDateTime()); + try { + return md2.getDateTime().compareTo(md1.getDateTime()); + } catch (NullPointerException e) { + return 0; + } }); return speechMetaDataList; @@ -399,6 +422,14 @@ public class MongoPprUtils { } } + /** + * Liefert die Rede-Informationen für die Anzeige einer Rede: + * - die Rede-ID + * - Name und Fraktion des Redners + * - Die Inhalte der Rede + * @param key: Rede ID + * @return + */ public static HtmlSpeech getSpeechByKey(String key) { Document filter = new Document("speechKey", key); Document speechDoc = getSpeechCollection().find(filter).first(); diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/html/HtmlSpeech.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/html/HtmlSpeech.java index 3b4080e..07a4fd5 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/html/HtmlSpeech.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/html/HtmlSpeech.java @@ -2,6 +2,9 @@ package org.texttechnologylab.project.gruppe_05_1.domain.html; import org.bson.Document; import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.NlpInfo; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Token; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Topic; import java.util.ArrayList; import java.util.List; @@ -13,6 +16,7 @@ public class HtmlSpeech { String speakerName; String fraction; List content = new ArrayList<>(); + NlpInfo nlp = null; public HtmlSpeech() { } @@ -30,6 +34,33 @@ public class HtmlSpeech { addContent(new SpeechContent(contentDoc)); } } + + Document nlpDoc = (Document) doc.get("analysisResults"); + nlp = readNlpInfo(nlpDoc); + } + + private NlpInfo readNlpInfo(Document nlpDoc) { + if (nlpDoc == null) return null; + NlpInfo nlp = new NlpInfo(); + + // TODO: HERE + List tokensDocs = nlpDoc.get("tokens", MongoDBHandler.DOC_LIST_CLASS); + nlp.setTokens(Token.readTokensFromMongo(tokensDocs)); + + List sentencesDocs = nlpDoc.get("sentences", MongoDBHandler.DOC_LIST_CLASS); + + List dependenciesDocs = nlpDoc.get("dependencies", MongoDBHandler.DOC_LIST_CLASS); + + List namedEntitiesDocs = nlpDoc.get("namedEntities", MongoDBHandler.DOC_LIST_CLASS); + + List sentimentsDocs = nlpDoc.get("sentiments", MongoDBHandler.DOC_LIST_CLASS); + + List topicsDocs = nlpDoc.get("topics", MongoDBHandler.DOC_LIST_CLASS); + nlp.setTopics(Topic.readTopicsFromMongo(topicsDocs)); + + // TODO: Video + + return nlp; } public String getSpeechKey() { @@ -68,16 +99,26 @@ public class HtmlSpeech { content.add(contentLine); } + public NlpInfo getNlp() { + return nlp; + } + + public void setNlp(NlpInfo nlp) { + this.nlp = nlp; + } + @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof HtmlSpeech that)) return false; - return Objects.equals(speechKey, that.speechKey) && Objects.equals(speakerName, that.speakerName) && Objects.equals(fraction, that.fraction) && Objects.equals(content, that.content); + return Objects.equals(speechKey, that.speechKey) && Objects.equals(speakerName, that.speakerName) + && Objects.equals(fraction, that.fraction) && Objects.equals(content, that.content) + && Objects.equals(nlp, that.nlp); } @Override public int hashCode() { - return Objects.hash(speechKey, speakerName, fraction, content); + return Objects.hash(speechKey, speakerName, fraction, content, nlp); } @Override @@ -87,6 +128,7 @@ public class HtmlSpeech { .add("speakerName='" + speakerName + "'") .add("fraction='" + fraction + "'") .add("content=" + content) + .add("nlp=" + nlp) .toString(); } } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/AudioToken.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/AudioToken.java new file mode 100644 index 0000000..de72e1d --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/AudioToken.java @@ -0,0 +1,87 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class AudioToken { + + private int begin; + private int end; + private double timeStart; + private double timeEnd; + private String value; + + public AudioToken() { + } + + public AudioToken(int begin, int end, double timeStart, double timeEnd, String value) { + this.begin = begin; + this.end = end; + this.timeStart = timeStart; + this.timeEnd = timeEnd; + this.value = value; + } + + public int getBegin() { + return begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + public double getTimeStart() { + return timeStart; + } + + public void setTimeStart(double timeStart) { + this.timeStart = timeStart; + } + + public double getTimeEnd() { + return timeEnd; + } + + public void setTimeEnd(double timeEnd) { + this.timeEnd = timeEnd; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof AudioToken that)) return false; + return begin == that.begin && end == that.end && Double.compare(timeStart, that.timeStart) == 0 && Double.compare(timeEnd, that.timeEnd) == 0 && Objects.equals(value, that.value); + } + + @Override + public int hashCode() { + return Objects.hash(begin, end, timeStart, timeEnd, value); + } + + @Override + public String toString() { + return new StringJoiner(", ", AudioToken.class.getSimpleName() + "[", "]") + .add("begin=" + begin) + .add("end=" + end) + .add("timeStart=" + timeStart) + .add("timeEnd=" + timeEnd) + .add("value='" + value + "'") + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Dependency.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Dependency.java new file mode 100644 index 0000000..ad5ead6 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Dependency.java @@ -0,0 +1,64 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class Dependency { + String type; + String governor; + String dependent; + + public Dependency() { + } + + public Dependency(String type, String governor, String dependent) { + this.type = type; + this.governor = governor; + this.dependent = dependent; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getGovernor() { + return governor; + } + + public void setGovernor(String governor) { + this.governor = governor; + } + + public String getDependent() { + return dependent; + } + + public void setDependent(String dependent) { + this.dependent = dependent; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Dependency that)) return false; + return Objects.equals(type, that.type) && Objects.equals(governor, that.governor) && Objects.equals(dependent, that.dependent); + } + + @Override + public int hashCode() { + return Objects.hash(type, governor, dependent); + } + + @Override + public String toString() { + return new StringJoiner(", ", Dependency.class.getSimpleName() + "[", "]") + .add("type='" + type + "'") + .add("governor='" + governor + "'") + .add("dependent='" + dependent + "'") + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NamedEntity.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NamedEntity.java new file mode 100644 index 0000000..0f3ec5d --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NamedEntity.java @@ -0,0 +1,55 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class NamedEntity { + String type; // PER, LOC etc. + // int begin; // TODO: momentan nicht in MongoDB + // int end; // TODO: momentan nicht in MongoDB + String text; + + public NamedEntity() { + } + + public NamedEntity(String type, String text) { + this.type = type; + this.text = text; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof NamedEntity that)) return false; + return Objects.equals(type, that.type) && Objects.equals(text, that.text); + } + + @Override + public int hashCode() { + return Objects.hash(type, text); + } + + @Override + public String toString() { + return new StringJoiner(", ", NamedEntity.class.getSimpleName() + "[", "]") + .add("type='" + type + "'") + .add("text='" + text + "'") + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NlpInfo.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NlpInfo.java new file mode 100644 index 0000000..de7ecd5 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/NlpInfo.java @@ -0,0 +1,123 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.StringJoiner; + +public class NlpInfo { + List tokens; + List sentences; + List dependencies; + List namedEntities; + Sentiment overallSentiment; // Sentiment for the whole text ; kann null sein! + List sentiments; // sentiments for the respective sentences (eine Liste von 0..n Elementen) + List topics; + List posList; + + VideoInformation videoInformation; + + public List getTokens() { + return tokens; + } + + public void setTokens(List tokens) { + this.tokens = tokens; + } + + public List getSentences() { + return sentences; + } + + public void setSentences(List sentences) { + this.sentences = sentences; + } + + public List getDependencies() { + return dependencies; + } + + public void setDependencies(List dependencies) { + this.dependencies = dependencies; + } + + public List getNamedEntities() { + return namedEntities; + } + + public void setNamedEntities(List namedEntities) { + this.namedEntities = namedEntities; + } + + public Sentiment getOverallSentiment() { + return overallSentiment; + } + + public void setOverallSentiment(Sentiment overallSentiment) { + this.overallSentiment = overallSentiment; + } + + public List getSentiments() { + return sentiments; + } + + public void setSentiments(List sentiments) { + this.sentiments = sentiments; + } + + public List getTopics() { + return topics; + } + + public void setTopics(List topics) { + this.topics = topics; + } + + public List getPosList() { + return posList; + } + + public void setPosList(List posList) { + this.posList = posList; + } + + public VideoInformation getVideoInformation() { + return videoInformation; + } + + public void setVideoInformation(VideoInformation videoInformation) { + this.videoInformation = videoInformation; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof NlpInfo nlpInfo)) return false; + return Objects.equals(tokens, nlpInfo.tokens) && Objects.equals(sentences, nlpInfo.sentences) + && Objects.equals(dependencies, nlpInfo.dependencies) && Objects.equals(namedEntities, nlpInfo.namedEntities) + && Objects.equals(overallSentiment, nlpInfo.overallSentiment) && Objects.equals(sentiments, nlpInfo.sentiments) + && Objects.equals(topics, nlpInfo.topics) && Objects.equals(posList, nlpInfo.posList) + && Objects.equals(videoInformation, nlpInfo.videoInformation); + } + + @Override + public int hashCode() { + return Objects.hash(tokens, sentences, dependencies, namedEntities, overallSentiment, sentiments, topics, posList, videoInformation); + } + + @Override + public String toString() { + return new StringJoiner(", ", NlpInfo.class.getSimpleName() + "[", "]") + .add("tokens=" + tokens) + .add("sentences=" + sentences) + .add("dependencies=" + dependencies) + .add("namedEntities=" + namedEntities) + .add("overallSentiment=" + overallSentiment) + .add("sentiments=" + sentiments) + .add("topics=" + topics) + .add("posList=" + posList) + .add("videoInformation=" + videoInformation) + .toString(); + } + +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Pos.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Pos.java new file mode 100644 index 0000000..74f027a --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Pos.java @@ -0,0 +1,119 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class Pos { + String posValue; // ART, NN... + String coarseValue; // PROPN... + int begin; + int end; + String coveredText; + + // Am Dateiende stehen beispiele für mögliche Werte + + + public Pos() { + } + + public Pos(String posValue, String coarseValue, int begin, int end, String coveredText) { + this.posValue = posValue; + this.coarseValue = coarseValue; + this.begin = begin; + this.end = end; + this.coveredText = coveredText; + } + + public String getPosValue() { + return posValue; + } + + public void setPosValue(String posValue) { + this.posValue = posValue; + } + + public String getCoarseValue() { + return coarseValue; + } + + public void setCoarseValue(String coarseValue) { + this.coarseValue = coarseValue; + } + + public int getBegin() { + return begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + public String getCoveredText() { + return coveredText; + } + + public void setCoveredText(String coveredText) { + this.coveredText = coveredText; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Pos pos)) return false; + return begin == pos.begin && end == pos.end && Objects.equals(posValue, pos.posValue) && Objects.equals(coarseValue, pos.coarseValue) && Objects.equals(coveredText, pos.coveredText); + } + + @Override + public int hashCode() { + return Objects.hash(posValue, coarseValue, begin, end, coveredText); + } + + @Override + public String toString() { + return new StringJoiner(", ", Pos.class.getSimpleName() + "[", "]") + .add("posValue='" + posValue + "'") + .add("coarseValue='" + coarseValue + "'") + .add("begin=" + begin) + .add("end=" + end) + .add("coveredText='" + coveredText + "'") + .toString(); + } + + /* Beispielswerte: + + MyPos{posValue='ART', coarseValue='DET', begin=0, end=3, coveredText='Die'}, + MyPos{posValue='NN', coarseValue='NOUN', begin=4, end=8, coveredText='Idee'}, + MyPos{posValue='APPR', coarseValue='ADP', begin=9, end=12, coveredText='von'}, + MyPos{posValue='NE', coarseValue='PROPN', begin=13, end=16, coveredText='Joe'}, + MyPos{posValue='NN', coarseValue='PROPN', begin=17, end=22, coveredText='Biden'}, + MyPos{posValue='APPR', coarseValue='ADP', begin=23, end=26, coveredText='aus'}, + MyPos{posValue='NE', coarseValue='PROPN', begin=27, end=36, coveredText='Bucharest'}, + MyPos{posValue='$,', coarseValue='PUNCT', begin=36, end=37, coveredText=','}, + MyPos{posValue='NE', coarseValue='PROPN', begin=38, end=46, coveredText='Rumänien'}, + MyPos{posValue='$,', coarseValue='PUNCT', begin=46, end=47, coveredText=','}, + MyPos{posValue='VVFIN', coarseValue='VERB', begin=48, end=53, coveredText='finde'}, + MyPos{posValue='PPER', coarseValue='PRON', begin=54, end=57, coveredText='ich'}, + MyPos{posValue='ADJD', coarseValue='ADV', begin=58, end=61, coveredText='gut'}, + MyPos{posValue='$.', coarseValue='PUNCT', begin=61, end=62, coveredText='.'}, + MyPos{posValue='ART', coarseValue='DET', begin=63, end=66, coveredText='Den'}, + MyPos{posValue='NN', coarseValue='NOUN', begin=67, end=76, coveredText='Vorschlag'}, + MyPos{posValue='APPR', coarseValue='ADP', begin=77, end=80, coveredText='von'}, + MyPos{posValue='NE', coarseValue='PROPN', begin=81, end=87, coveredText='Donald'}, + MyPos{posValue='NE', coarseValue='PROPN', begin=88, end=93, coveredText='Trump'}, + MyPos{posValue='APPR', coarseValue='ADP', begin=94, end=97, coveredText='aus'}, + MyPos{posValue='NE', coarseValue='PROPN', begin=98, end=108, coveredText='Frankreich'}, + MyPos{posValue='VVFIN', coarseValue='VERB', begin=109, end=114, coveredText='finde'}, + MyPos{posValue='PPER', coarseValue='PRON', begin=115, end=118, coveredText='ich'}, + MyPos{posValue='ADV', coarseValue='ADV', begin=119, end=126, coveredText='weniger'}, + MyPos{posValue='ADJD', coarseValue='ADV', begin=127, end=130, coveredText='gut'}, + MyPos{posValue='$.', coarseValue='PUNCT', begin=130, end=131, coveredText='.'}], + */ +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentence.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentence.java new file mode 100644 index 0000000..213f58e --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentence.java @@ -0,0 +1,44 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class Sentence { + // int begin; // TODO: momentan nicht in MongoDB + // int end; // TODO: momentan nicht in MongoDB + String text; + + public Sentence() { + } + + public Sentence(String text) { + this.text = text; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Sentence sentence)) return false; + return Objects.equals(text, sentence.text); + } + + @Override + public int hashCode() { + return Objects.hash(text); + } + + @Override + public String toString() { + return new StringJoiner(", ", Sentence.class.getSimpleName() + "[", "]") + .add("text='" + text + "'") + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentiment.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentiment.java new file mode 100644 index 0000000..a2f04e3 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Sentiment.java @@ -0,0 +1,97 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.Objects; +import java.util.StringJoiner; + +public class Sentiment { + int begin; + int end; + double sentiment; // overall sentiment + double negative; + double neutral; + double positive; + + public Sentiment() { + } + + public Sentiment(int begin, int end, double sentiment, double negative, double neutral, double positive) { + this.begin = begin; + this.end = end; + this.sentiment = sentiment; + this.negative = negative; + this.neutral = neutral; + this.positive = positive; + } + + public int getBegin() { + return begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + public double getSentiment() { + return sentiment; + } + + public void setSentiment(double sentiment) { + this.sentiment = sentiment; + } + + public double getNegative() { + return negative; + } + + public void setNegative(double negative) { + this.negative = negative; + } + + public double getNeutral() { + return neutral; + } + + public void setNeutral(double neutral) { + this.neutral = neutral; + } + + public double getPositive() { + return positive; + } + + public void setPositive(double positive) { + this.positive = positive; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Sentiment sentiment1)) return false; + return begin == sentiment1.begin && end == sentiment1.end && Double.compare(sentiment, sentiment1.sentiment) == 0 && Double.compare(negative, sentiment1.negative) == 0 && Double.compare(neutral, sentiment1.neutral) == 0 && Double.compare(positive, sentiment1.positive) == 0; + } + + @Override + public int hashCode() { + return Objects.hash(begin, end, sentiment, negative, neutral, positive); + } + + @Override + public String toString() { + return new StringJoiner(", ", Sentiment.class.getSimpleName() + "[", "]") + .add("begin=" + begin) + .add("end=" + end) + .add("sentiment=" + sentiment) + .add("negative=" + negative) + .add("neutral=" + neutral) + .add("positive=" + positive) + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Token.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Token.java new file mode 100644 index 0000000..09a5365 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Token.java @@ -0,0 +1,96 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import org.bson.Document; + +import java.util.*; + +public class Token { + String text; + String pos; + String lemma; + + public Token() { + } + + public Token(String text, String pos, String lemma) { + this.text = text; + this.pos = pos; + this.lemma = lemma; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public String getPos() { + return pos; + } + + public void setPos(String pos) { + this.pos = pos; + } + + public String getLemma() { + return lemma; + } + + public void setLemma(String lemma) { + this.lemma = lemma; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Token token)) return false; + return Objects.equals(text, token.text) && Objects.equals(pos, token.pos) && Objects.equals(lemma, token.lemma); + } + + @Override + public int hashCode() { + return Objects.hash(text, pos, lemma); + } + + @Override + public String toString() { + return new StringJoiner(", ", Token.class.getSimpleName() + "[", "]") + .add("text='" + text + "'") + .add("pos='" + pos + "'") + .add("lemma='" + lemma + "'") + .toString(); + } + + /** + * Die Token-Dokumente (Speech --> analysisResults --> token) aus der MongoDB lesen + * @param tokenDocs Eine Liste von Mongo-Dokumenten + * @return Eine Liste der Token + */ + public static List readTokensFromMongo(List tokenDocs) { + List tokens = new ArrayList<>(); + for (Document doc : tokenDocs) { + tokens.add(new Token(doc.getString("text"), + doc.getString("pos"), + doc.getString("lemma") + )); + } + return tokens; + } + + /** + * Zählt alle verschiedenen POS Vorkommen auf + * @param tokenList + * @return Jede POS art mit ihrer Anzahl an Vorkommen + */ + public static Map countPOS(List tokenList) { + Map posCounts = new HashMap<>(); + + for (Token token : tokenList) { + posCounts.put(token.getPos(), posCounts.getOrDefault(token.getPos(), 0) + 1); + } + + return posCounts; + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Topic.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Topic.java new file mode 100644 index 0000000..474f5aa --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/Topic.java @@ -0,0 +1,107 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import org.bson.Document; + +import java.util.*; +import java.util.stream.Collectors; + +public class Topic { + String topic; + Double score; + String text; + + public Topic() { + } + + public Topic(String topic, Double score, String text) { + this.topic = topic; + this.score = score; + this.text = text; + } + + public String getTopic() { + return topic; + } + + public void setTopic(String topic) { + this.topic = topic; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Topic topic1)) return false; + return Double.compare(score, topic1.score) == 0 && Objects.equals(topic, topic1.topic) && Objects.equals(text, topic1.text); + } + + @Override + public int hashCode() { + return Objects.hash(topic, score, text); + } + + @Override + public String toString() { + return new StringJoiner(", ", Topic.class.getSimpleName() + "[", "]") + .add("topic='" + topic + "'") + .add("score=" + score) + .add("text='" + text + "'") + .toString(); + } + + + /** + * Die Topics-Dokumente (Speech --> analysisResults --> topics) aus der MongoDB lesen + * @param topicsDocs Eine Liste von Mongo-Dokumenten + * @return Eine Liste der Topics + */ + public static List readTopicsFromMongo(List topicsDocs) { + List topics = new ArrayList<>(); + for (Document doc : topicsDocs) { + topics.add(new Topic(doc.getString("topic"), + doc.getDouble("score"), + doc.getString("text") + )); + } + return topics; + } + + + /** + * Topic-Informationen "verdichten": + * Ausgangssituation: eine Liste mit mehreren Topics. Ein Topic kann in dieser Liste mehrfach vorkommen. + * Man will wissen, welche Score hat jeden Topic. Hier werden die Werte der jeweiligen Topics summiert. + * + * @param topicsList + * @return Map + */ + public static Map condenseTopicInformation(List topicsList) { + Map condensedTopicInfo = new HashMap<>(); + + for (Topic t : topicsList) { + Double oldValue = condensedTopicInfo.get(t.getTopic()); + if (oldValue != null) { + condensedTopicInfo.replace(t.getTopic(), oldValue + t.getScore()); + } else { + condensedTopicInfo.put(t.getTopic(), t.getScore()); + } + } + + return condensedTopicInfo; + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/VideoInformation.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/VideoInformation.java new file mode 100644 index 0000000..e33db12 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/VideoInformation.java @@ -0,0 +1,43 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp; + +import java.util.List; +import java.util.Objects; +import java.util.StringJoiner; + +public class VideoInformation { + List audioTokens; + + public VideoInformation() { + } + + public VideoInformation(List audioTokens) { + this.audioTokens = audioTokens; + } + + public List getAudioTokens() { + return audioTokens; + } + + public void setAudioTokens(List audioTokens) { + this.audioTokens = audioTokens; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof VideoInformation that)) return false; + return Objects.equals(audioTokens, that.audioTokens); + } + + @Override + public int hashCode() { + return Objects.hash(audioTokens); + } + + @Override + public String toString() { + return new StringJoiner(", ", VideoInformation.class.getSimpleName() + "[", "]") + .add("audioTokens=" + audioTokens) + .toString(); + } +} diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/html/SentimentOfSentence.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/html/SentimentOfSentence.java new file mode 100644 index 0000000..d96dcf0 --- /dev/null +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/domain/nlp/html/SentimentOfSentence.java @@ -0,0 +1,112 @@ +package org.texttechnologylab.project.gruppe_05_1.domain.nlp.html; + +import java.util.Objects; + +/** + * Diese Klasse ordnet das entspreche Sentiment zu einem Satz zu. + * Sie ist ein Datencontainer für die Darstellung über FreeMarker + */ +public class SentimentOfSentence { + int begin; + int end; + String text; + // RGBA Werte für die Darstellung + float sentiment; // overall sentiment --> wird für den alpha (Opaque) Wert verwendet --> 0..1 + int negative; // red --> 0..255 + int neutral; // 0..255, wird momentan nicht benutzt + int positive; // green --> 0..255 + + public SentimentOfSentence() {} + + public SentimentOfSentence(int begin, int end, String text, float sentiment, int negative, int neutral, int positive) { + this.begin = begin; + this.end = end; + this.text = text; + this.sentiment = sentiment; + this.negative = negative; + this.neutral = neutral; + this.positive = positive; + } + + public int getBegin() { + return begin; + } + + public void setBegin(int begin) { + this.begin = begin; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } + + public float getSentiment() { + return sentiment; + } + + public void setSentiment(float sentiment) { + this.sentiment = sentiment; + } + + public int getNegative() { + return negative; + } + + public void setNegative(int negative) { + this.negative = negative; + } + + public int getNeutral() { + return neutral; + } + + public void setNeutral(int neutral) { + this.neutral = neutral; + } + + public int getPositive() { + return positive; + } + + public void setPositive(int positive) { + this.positive = positive; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof SentimentOfSentence that)) return false; + return begin == that.begin && end == that.end && Double.compare(sentiment, that.sentiment) == 0 && Double.compare(negative, that.negative) == 0 && Double.compare(neutral, that.neutral) == 0 && Double.compare(positive, that.positive) == 0 && Objects.equals(text, that.text); + } + + @Override + public int hashCode() { + return Objects.hash(begin, end, text, sentiment, negative, neutral, positive); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("SentimentOfSentence{"); + sb.append("begin=").append(begin); + sb.append(", end=").append(end); + sb.append(", text='").append(text).append('\''); + sb.append(", sentiment=").append(sentiment); + sb.append(", negative=").append(negative); + sb.append(", neutral=").append(neutral); + sb.append(", positive=").append(positive); + sb.append('}'); + return sb.toString(); + } +} + diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java index 6c4c37a..e01e647 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/NlpUtils.java @@ -360,6 +360,8 @@ public class NlpUtils { bulkOperations.add(new UpdateOneModel<>(updateFilter, update)); } if (!bulkOperations.isEmpty()) { + System.out.println("Processing of " + bulkOperations.size() + " documents finished"); + System.out.println("uploading..."); mongoDBHandler.bulkWriteNlpData(bulkOperations); Logger.debug("Bulk write completed for " + bulkOperations.size() + " documents."); mongoDBHandler.close(); diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java index 30566ff..89fee0d 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/nlp/XmiExtractor.java @@ -1,15 +1,16 @@ package org.texttechnologylab.project.gruppe_05_1.nlp; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; import com.mongodb.client.model.Filters; import com.mongodb.client.model.UpdateOneModel; import com.mongodb.client.model.WriteModel; -import com.mongodb.client.result.UpdateResult; import org.apache.uima.fit.util.JCasUtil; import org.bson.Document; import java.io.*; -import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.*; import java.util.*; import java.util.stream.Collectors; @@ -18,12 +19,7 @@ import org.bson.conversions.Bson; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.cas.impl.XmiCasDeserializer; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler; -import org.hucompute.textimager.uima.type.category.CategoryCoveredTagged; import org.texttechnologylab.project.gruppe_05_1.util.Logger; import static org.texttechnologylab.project.gruppe_05_1.Main.JCAS_SPEECHES_TYPESYSTEM_DIR; @@ -31,29 +27,45 @@ import static org.texttechnologylab.project.gruppe_05_1.Main.JCAS_SPEECHES_TYPES public class XmiExtractor { - private List> bulkOperations; - private MongoDBHandler mongoDBHandler; + private final List> bulkOperations = Collections.synchronizedList(new ArrayList<>()); + private final MongoDBHandler mongoDBHandler; private static final int BATCH_SIZE = 1000; - private int processedCount = 0; + private static final AtomicInteger processedCount = new AtomicInteger(0); + public XmiExtractor() { mongoDBHandler = new MongoDBHandler(); - this.bulkOperations = new ArrayList<>(); } public void extractAndUploadXmiData() throws IOException { InputStream resourceStream = getClass().getClassLoader().getResourceAsStream("speeches/20.zip"); + if (resourceStream == null) { + throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches"); + } + ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); + List> futures = new ArrayList<>(); try (ZipInputStream zis = new ZipInputStream(resourceStream)) { ZipEntry entry; while ((entry = zis.getNextEntry()) != null) { if (entry.getName().endsWith(".xmi.gz")) { - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - byte[] buffer = new byte[1024]; - int len; - while ((len = zis.read(buffer)) > 0) { - baos.write(buffer, 0, len); + File tempFile = File.createTempFile("xmi_entry_", ".xmi.gz"); + try (FileOutputStream fos = new FileOutputStream(tempFile)) { + byte[] buffer = new byte[1024]; + int len; + while ((len = zis.read(buffer)) > 0) { + fos.write(buffer, 0, len); + } } - byte[] entryData = baos.toByteArray(); - processXmiGzStream(new ByteArrayInputStream(entryData), entry.getName()); + ZipEntry finalEntry = entry; + Future future = executor.submit(() -> { + try (FileInputStream fis = new FileInputStream(tempFile)) { + processXmiGzStream(fis, finalEntry.getName()); + } catch (IOException e) { + e.printStackTrace(); + } finally { + tempFile.delete(); + } + }); + futures.add(future); } zis.closeEntry(); } @@ -61,7 +73,16 @@ public class XmiExtractor { Logger.error("Error reading XMI data from ZIP file."); Logger.error(e.getMessage()); } - flushBatch(); + for (Future future : futures) { + try { + future.get(); + } catch (Exception e) { + e.printStackTrace(); + } + } + executor.shutdown(); + flushBatch(); // Synchronously upload the remaining batch + mongoDBHandler.close(); } private void processXmiGzStream(InputStream inputStream, String filename) { @@ -70,12 +91,10 @@ public class XmiExtractor { jCas = JCasFactory.createJCas(JCAS_SPEECHES_TYPESYSTEM_DIR); XmiCasDeserializer.deserialize(gis, jCas.getCas(), true); - // Build structured analysisResults Document Document analysisResults = new Document(); - // Tokens: Include POS, Lemma, etc. List tokens = new ArrayList<>(); - for (Token token : JCasUtil.select(jCas, Token.class)) { + for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token token : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class)) { Document tokenDoc = new Document() .append("text", token.getCoveredText()) .append("pos", token.getPos().getPosValue()) @@ -84,15 +103,13 @@ public class XmiExtractor { } analysisResults.append("tokens", tokens); - // Sentences - List sentences = JCasUtil.select(jCas, Sentence.class).stream() - .map(Sentence::getCoveredText) + List sentences = JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence.class).stream() + .map(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence::getCoveredText) .collect(Collectors.toList()); analysisResults.append("sentences", sentences); - // Dependencies List dependencies = new ArrayList<>(); - for (Dependency dep : JCasUtil.select(jCas, Dependency.class)) { + for (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency dep : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency.class)) { Document depDoc = new Document() .append("type", dep.getDependencyType()) .append("governor", dep.getGovernor().getCoveredText()) @@ -101,9 +118,8 @@ public class XmiExtractor { } analysisResults.append("dependencies", dependencies); - // Named Entities List namedEntities = new ArrayList<>(); - for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) { + for (de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity ne : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity.class)) { Document neDoc = new Document() .append("text", ne.getCoveredText()) .append("type", ne.getValue()); @@ -111,23 +127,16 @@ public class XmiExtractor { } analysisResults.append("namedEntities", namedEntities); - // Sentiment List sentiments = new ArrayList<>(); - for (org.hucompute.textimager.uima.type.Sentiment sentiment : - JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) { - + for (org.hucompute.textimager.uima.type.Sentiment sentiment : JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) { Document sentimentDoc = new Document() .append("begin", sentiment.getBegin()) .append("end", sentiment.getEnd()) .append("score", sentiment.getSentiment()) .append("subjectivity", sentiment.getSubjectivity()); - - // Check for VaderSentiment subtype if (sentiment instanceof org.hucompute.textimager.uima.type.VaderSentiment) { - org.hucompute.textimager.uima.type.VaderSentiment vader = - (org.hucompute.textimager.uima.type.VaderSentiment) sentiment; - sentimentDoc - .append("pos", vader.getPos()) + org.hucompute.textimager.uima.type.VaderSentiment vader = (org.hucompute.textimager.uima.type.VaderSentiment) sentiment; + sentimentDoc.append("pos", vader.getPos()) .append("neu", vader.getNeu()) .append("neg", vader.getNeg()); } @@ -136,39 +145,36 @@ public class XmiExtractor { analysisResults.append("sentiments", sentiments); List topics = new ArrayList<>(); - for (CategoryCoveredTagged topic : JCasUtil.select(jCas, CategoryCoveredTagged.class)) { + for (org.hucompute.textimager.uima.type.category.CategoryCoveredTagged topic : JCasUtil.select(jCas, org.hucompute.textimager.uima.type.category.CategoryCoveredTagged.class)) { Document topicDoc = new Document() .append("topic", topic.getValue()) .append("score", topic.getScore()) - .append("tags", topic.getTags()) .append("text", topic.getCoveredText()); topics.add(topicDoc); } topics.sort((d1, d2) -> Double.compare(d2.getDouble("score"), d1.getDouble("score"))); analysisResults.append("topics", topics); - - // Upload structured Document to MongoDB String speechKey = extractSpeechKeyFromFilename(filename); if (speechKey != null) { Bson filter = Filters.eq("speechKey", speechKey); Bson update = new Document("$set", new Document("analysisResults", analysisResults)); UpdateOneModel updateModel = new UpdateOneModel<>(filter, update); bulkOperations.add(updateModel); - if (bulkOperations.size() >= BATCH_SIZE) { - flushBatch(); + synchronized (bulkOperations) { + if (bulkOperations.size() >= BATCH_SIZE) { + Logger.info("BATCH_SIZE to Upload: " + bulkOperations.size()); + flushBatch(); + } } - processedCount++; - if (processedCount % 5000 == 0) { - Logger.info("Processed speeches: " + processedCount); + int count = processedCount.incrementAndGet(); + if (count % 1000 == 0) { + Logger.info("Processed speeches: " + count); } - } - } catch (Exception e) { e.printStackTrace(); - } - finally { + } finally { if (jCas != null) { jCas.reset(); } @@ -180,23 +186,11 @@ public class XmiExtractor { return baseName.replace("20/", ""); } - private void flushBatch() { + private synchronized void flushBatch() { if (!bulkOperations.isEmpty()) { mongoDBHandler.bulkWriteNlpData(bulkOperations); bulkOperations.clear(); } } - - - /* - public static void main(String[] args) { - try { - XmiExtractor extractor = new XmiExtractor(database); - extractor.extractAndUploadXmiData(); - System.out.println("Processing complete."); - } catch (Exception e) { - e.printStackTrace(); - } - } */ } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/ParlamentarierController.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/ParlamentarierController.java index 8402965..c522aa2 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/ParlamentarierController.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/ParlamentarierController.java @@ -89,6 +89,7 @@ public class ParlamentarierController { String id = ctx.pathParam("id"); Logger.info("getParlamentarierDetails, ID = " + id); + // Alle Details des Abgeordnetes (Vor- und Nachname, Geburts- und Sterbeort, Partei, Vita etc.) ParlamentarierDetails pd = MongoPprUtils.getParlamentarierDetailsByID(id); Map attributes = new HashMap<>(); @@ -101,6 +102,10 @@ public class ParlamentarierController { attributes.put("speechesPlaceholder", emptyList); } + // Foto des Abgeordnetes + String picture = MongoPprUtils.getParlamentarierPictureByID(id); + attributes.put("pic", picture); + ctx.render("parlamentarierDetails.ftl", attributes); } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java index df6e5cb..f6bb0ad 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/rest/SpeechController.java @@ -5,12 +5,16 @@ import io.javalin.openapi.*; import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils; import org.texttechnologylab.project.gruppe_05_1.domain.html.HtmlSpeech; import org.texttechnologylab.project.gruppe_05_1.domain.html.ParlamentarierDetails; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Token; +import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Topic; import org.texttechnologylab.project.gruppe_05_1.domain.speech.SpeechMetaData; import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; public class SpeechController { /** @@ -68,6 +72,38 @@ public class SpeechController { HtmlSpeech speech = MongoPprUtils.getSpeechByKey(redeId); attributes.put("s", speech); + // NLP: Topic + if ((speech.getNlp() != null) && (speech.getNlp().getTopics() != null)) { + Map topics = Topic.condenseTopicInformation(speech.getNlp().getTopics()); // Daten "verdichten"... + // ... und ersetzen + speech.getNlp().setTopics( + topics.entrySet().stream() + .map(me -> new Topic(me.getKey(), me.getValue(), null)) + .collect(Collectors.toList())); + } + + // NLP: POS + if (speech.getNlp() != null && speech.getNlp().getTokens() != null) { + List tokens = speech.getNlp().getTokens(); + + Map posCounts = Token.countPOS(tokens); + + List posList = posCounts.entrySet().stream() + .map(entry -> new Token(entry.getKey(), String.valueOf(entry.getValue()), "")) // Lemma remains empty + .collect(Collectors.toList()); + + System.out.println("DEBUG: Sending POS List to NLP - " + posList); + + speech.getNlp().setPosList((List) posList); + + } else { + System.out.println("DEBUG: POS List is EMPTY"); + speech.getNlp().setPosList((List) new ArrayList()); // Ensure it's never null + } + + // TODO: Token wird momentan etwas komisch abgespeichert, da im Attribut text die POS art steht, und in pos die Anzahl dieser POS arten. Umstrukturieren damit keine Verwirrung herrscht + + ctx.render("speech.ftl", attributes); } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java index bd32967..36eed86 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java @@ -23,6 +23,8 @@ import java.io.*; import java.net.HttpURLConnection; import java.net.URL; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -423,4 +425,61 @@ public abstract class PPRUtils { return fileNames; } + public static Set checkAndProcessNewProtocols(MongoDBHandler mongoDBHandler) { + Set newProtocols = new HashSet<>(); + int offset = 0; + int limit = 10; + boolean hasMore = true; + Pattern sessionPattern = Pattern.compile("Plenarprotokoll der (\\d+)\\. Sitzung"); + + while (hasMore) { + String queryUrl = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?limit=" + + limit + "&noFilterSet=true&offset=" + offset; + try { + org.jsoup.nodes.Document htmlDoc = Jsoup.connect(queryUrl).get(); + Elements sessionLinks = htmlDoc.select("a.bt-link-dokument"); + if (sessionLinks.isEmpty()) break; + + for (org.jsoup.nodes.Element link : sessionLinks) { + String xmlUrl = link.attr("href"); + String fileName = xmlUrl.substring(xmlUrl.lastIndexOf('/') + 1); // "20212.xml" + // Entferne die Dateiendung + String sessionNumberFull = fileName.replace(".xml", ""); // z.B. "20212" + String sessionNumber; + if (sessionNumberFull.startsWith("20") && sessionNumberFull.length() > 2) { + sessionNumber = sessionNumberFull.substring(2); + } else { + sessionNumber = sessionNumberFull; + } + if (!mongoDBHandler.sessionExists(sessionNumber)) { + try { + org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl); + newProtocols.add(xmlDoc); + } catch (Exception ex) { + Logger.error("Error processing XML for session " + sessionNumber + ": " + ex.getMessage()); + } + } + + } + + org.jsoup.nodes.Element metaSlider = htmlDoc.selectFirst("div.meta-slider"); + if (metaSlider != null && metaSlider.hasAttr("data-nextoffset")) { + int nextOffset = Integer.parseInt(metaSlider.attr("data-nextoffset")); + if (nextOffset <= offset) { + hasMore = false; + } else { + offset = nextOffset; + } + } else { + hasMore = false; + } + } catch (IOException e) { + Logger.error("Error loading page: " + queryUrl + " : " + e.getMessage()); + break; + } + } + return newProtocols; + } + + } diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java index c4c1ef6..1750a36 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/xml/speeches/SpeechParser.java @@ -40,7 +40,6 @@ public class SpeechParser { } public List parseAllSessions() { - List sessionsEmpty = new ArrayList<>(); List sessions = new ArrayList<>(); this.speeches = new ArrayList<>(); this.agendaItems = new ArrayList<>(); @@ -61,6 +60,26 @@ public class SpeechParser { } + public List parseAllSessions(Set xmlDocuments) { + List sessions = new ArrayList<>(); + this.speeches = new ArrayList<>(); + this.agendaItems = new ArrayList<>(); + Logger.info("All new sessions parsed"); + for (org.w3c.dom.Document xmlDoc : xmlDocuments) { + try { + File tempFile = convertDocumentToFile(xmlDoc); + Session session = parseSessionFile(tempFile); + sessions.add(session); + tempFile.delete(); // Lösche die temporäre Datei nach der Verarbeitung + } catch (Exception e) { + Logger.error("Error parsing XML document."); + Logger.error(e.getMessage()); + } + } + return sessions; + + } + private Session parseSessionFile(File file) throws Exception { //file = removeDoctypeAnnotation(file.getAbsolutePath()); diff --git a/src/main/resources/members_of_parliament_image_crawler.py b/src/main/resources/members_of_parliament_image_crawler.py index 9bb2340..2759e4c 100644 --- a/src/main/resources/members_of_parliament_image_crawler.py +++ b/src/main/resources/members_of_parliament_image_crawler.py @@ -763,7 +763,7 @@ members = [ ] # Base URL for querying (with placeholders for last name and first name) -base_url = "https://bilddatenbank.bundestag.de/search/picture-result?query={0}%2C+{1}&filterQuery%5Bereignis%5D%5B%5D=Portr%C3%A4t%2FPortrait&sortVal=2" +base_url = "https://bilddatenbank.bundestag.de/search/picture-result?query={0}+{1}&sortVal=2" #base_url = "https://bilddatenbank.bundestag.de/search/picture-result?filterQuery%5Bname%5D%5B%5D={0}l%2C+{1}&filterQuery%5Bereignis%5D%5B%5D=Portr%C3%A4t%2FPortrait&sortVal=2" def fetch_image(lastname, firstname): diff --git a/src/main/resources/templates/parlamentarierDetails.ftl b/src/main/resources/templates/parlamentarierDetails.ftl index bb8df72..63adf8d 100644 --- a/src/main/resources/templates/parlamentarierDetails.ftl +++ b/src/main/resources/templates/parlamentarierDetails.ftl @@ -17,10 +17,9 @@

${p.vorname} ${p.nachname} (${p.partei})

-

<#if pic??> - Foto von ${p.vorname}  ${p.nachname} (${p.partei}) + Foto von ${p.vorname}  ${p.nachname} (${p.partei}) <#else>

(kein Foto verfügbar)

diff --git a/src/main/resources/templates/speech.ftl b/src/main/resources/templates/speech.ftl index a0ef5a3..7a40243 100644 --- a/src/main/resources/templates/speech.ftl +++ b/src/main/resources/templates/speech.ftl @@ -23,11 +23,19 @@

Rede ${s.speechKey}

-
- <#list s.content as c> - <#include "speechContent.ftl"> - -
+ <#list s.content as c> + <#include "speechContent.ftl"> + + +

+ <#if s.nlp??> +

NLP Information

+ <#assign nlp = "${s.nlp}"> + <#include "nlp.ftl"> + <#else> +

Keine NLP Information verfügbar für diese Rede

+ +

diff --git a/src/main/resources/templates/topicsBubbleChart.ftl b/src/main/resources/templates/topicsBubbleChart.ftl index 4dba392..bef23fe 100644 --- a/src/main/resources/templates/topicsBubbleChart.ftl +++ b/src/main/resources/templates/topicsBubbleChart.ftl @@ -48,4 +48,3 @@ }); -