Resolve merge conflict
This commit is contained in:
parent
98d1d80fda
commit
e6ef7adc6c
25 changed files with 1266 additions and 88 deletions
|
@ -8,10 +8,18 @@ import org.texttechnologylab.project.gruppe_05_1.rest.RESTHandler;
|
|||
import org.texttechnologylab.project.gruppe_05_1.util.Logger;
|
||||
import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
|
||||
import org.texttechnologylab.project.gruppe_05_1.xml.FileObjectFactory;
|
||||
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.SpeechParser;
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static java.lang.Boolean.FALSE;
|
||||
import static java.lang.Boolean.TRUE;
|
||||
import static org.texttechnologylab.project.gruppe_05_1.util.PPRUtils.checkAndProcessNewProtocols;
|
||||
|
||||
public class Main {
|
||||
public static boolean UPLOAD_MEMBER_PHOTOS;
|
||||
|
@ -116,13 +124,33 @@ public class Main {
|
|||
Logger.pink("Uploading Member Photos to DB...");
|
||||
mongoDBHandler.uploadMemberPhotos();
|
||||
}
|
||||
mongoDBHandler.close();
|
||||
try {
|
||||
NlpUtils.runRemoteDriver();
|
||||
} catch (Exception e) {
|
||||
Logger.error("Error while running NLP remote driver");
|
||||
Logger.error(e.getMessage());
|
||||
}
|
||||
NlpUtils.runRemoteDriver();
|
||||
/*ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
|
||||
scheduler.scheduleAtFixedRate(() -> {
|
||||
try {
|
||||
NlpUtils.runRemoteDriver();
|
||||
} catch (Exception e) {
|
||||
Logger.error("Error while running NLP remote driver");
|
||||
Logger.error(e.getMessage());
|
||||
}
|
||||
try {
|
||||
Logger.info("Starte Aktualisierung der Protokolle...");
|
||||
Set<Document> newProtocols = checkAndProcessNewProtocols(mongoDBHandler);
|
||||
Logger.info("Neue Protokolle gefunden: " + newProtocols.size());
|
||||
if (newProtocols.isEmpty()) {
|
||||
Logger.info("Keine neuen Protokolle gefunden, Upload wird übersprungen.");
|
||||
} else {
|
||||
SpeechParser speechParser = new SpeechParser();
|
||||
mongoDBHandler.insertSessions(speechParser.parseAllSessions(newProtocols));
|
||||
mongoDBHandler.insertAgendaItems(speechParser.getAgendaItems());
|
||||
mongoDBHandler.insertSpeeches(speechParser.getSpeeches());
|
||||
Logger.info("Neuer Protokolle uploaded: " + newProtocols.size());
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
Logger.error("Fehler bei der Protokollaktualisierung: " + ex.getMessage());
|
||||
}
|
||||
}, 0, 10, TimeUnit.MINUTES);*/
|
||||
|
||||
RESTHandler restHandler = new RESTHandler();
|
||||
restHandler.startJavalin();
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package org.texttechnologylab.project.gruppe_05_1.database;
|
|||
import com.mongodb.MongoClientSettings;
|
||||
import com.mongodb.MongoCredential;
|
||||
import com.mongodb.ServerAddress;
|
||||
import com.mongodb.WriteConcern;
|
||||
import com.mongodb.bulk.BulkWriteResult;
|
||||
import com.mongodb.client.MongoClient;
|
||||
import com.mongodb.client.MongoClients;
|
||||
|
@ -691,7 +692,9 @@ public class MongoDBHandler {
|
|||
|
||||
public void bulkWriteNlpData(List<WriteModel<Document>> bulkOperations) {
|
||||
if (!bulkOperations.isEmpty()) {
|
||||
BulkWriteResult result = speechesCollection.bulkWrite(bulkOperations);
|
||||
BulkWriteOptions options = new BulkWriteOptions().ordered(false);
|
||||
// Optional: Setze einen weniger strengen Write Concern
|
||||
BulkWriteResult result = speechesCollection.bulkWrite(bulkOperations, options);
|
||||
int modifiedCount = result.getModifiedCount();
|
||||
int matchedCount = result.getMatchedCount();
|
||||
int upsertCount = result.getUpserts().size();
|
||||
|
@ -764,6 +767,12 @@ public class MongoDBHandler {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean sessionExists(String sessionNumber) {
|
||||
Document filter = new Document("sessionNumber", sessionNumber);
|
||||
long count = sessionsCollection.countDocuments(filter);
|
||||
return count > 0;
|
||||
}
|
||||
|
||||
public String getMemberPhoto(String memberId) {
|
||||
Document photoDocument = memberPhotoCollection.find(eq("memberId", memberId)).first();
|
||||
if (photoDocument == null) {
|
||||
|
|
|
@ -8,6 +8,7 @@ import org.texttechnologylab.project.gruppe_05_1.database.domainimp.speeches.Spe
|
|||
import org.texttechnologylab.project.gruppe_05_1.domain.html.HtmlSpeech;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.html.Parlamentarier;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.html.ParlamentarierDetails;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.*;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.speaker.Membership;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.speech.SpeechMetaData;
|
||||
import org.texttechnologylab.project.gruppe_05_1.util.GeneralUtils;
|
||||
|
@ -162,6 +163,24 @@ public class MongoPprUtils {
|
|||
return p;
|
||||
}
|
||||
|
||||
|
||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
|
||||
// Parlamentarier - Picture
|
||||
|
||||
/**
|
||||
*
|
||||
* @param id : ID des Parlamentariers
|
||||
* @return Das Foto (als Base64-encoded String)
|
||||
*/
|
||||
public static String getParlamentarierPictureByID(String id) {
|
||||
Document doc = MongoDBHandler.findFirstDocumentInCollection(getPicturesCollection(), "memberId", id);
|
||||
if (doc == null) {
|
||||
return null;
|
||||
} else return doc.getString("base64");
|
||||
}
|
||||
|
||||
// - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
|
||||
|
@ -362,7 +381,11 @@ public class MongoPprUtils {
|
|||
|
||||
// Sortiere nach Datum, absteigend
|
||||
speechMetaDataList.sort((md1, md2) -> {
|
||||
return md2.getDateTime().compareTo(md1.getDateTime());
|
||||
try {
|
||||
return md2.getDateTime().compareTo(md1.getDateTime());
|
||||
} catch (NullPointerException e) {
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
return speechMetaDataList;
|
||||
|
@ -399,6 +422,14 @@ public class MongoPprUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Liefert die Rede-Informationen für die Anzeige einer Rede:
|
||||
* - die Rede-ID
|
||||
* - Name und Fraktion des Redners
|
||||
* - Die Inhalte der Rede
|
||||
* @param key: Rede ID
|
||||
* @return
|
||||
*/
|
||||
public static HtmlSpeech getSpeechByKey(String key) {
|
||||
Document filter = new Document("speechKey", key);
|
||||
Document speechDoc = getSpeechCollection().find(filter).first();
|
||||
|
|
|
@ -2,6 +2,9 @@ package org.texttechnologylab.project.gruppe_05_1.domain.html;
|
|||
|
||||
import org.bson.Document;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.NlpInfo;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Token;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Topic;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -13,6 +16,7 @@ public class HtmlSpeech {
|
|||
String speakerName;
|
||||
String fraction;
|
||||
List<SpeechContent> content = new ArrayList<>();
|
||||
NlpInfo nlp = null;
|
||||
|
||||
public HtmlSpeech() {
|
||||
}
|
||||
|
@ -30,6 +34,33 @@ public class HtmlSpeech {
|
|||
addContent(new SpeechContent(contentDoc));
|
||||
}
|
||||
}
|
||||
|
||||
Document nlpDoc = (Document) doc.get("analysisResults");
|
||||
nlp = readNlpInfo(nlpDoc);
|
||||
}
|
||||
|
||||
private NlpInfo readNlpInfo(Document nlpDoc) {
|
||||
if (nlpDoc == null) return null;
|
||||
NlpInfo nlp = new NlpInfo();
|
||||
|
||||
// TODO: HERE
|
||||
List<Document> tokensDocs = nlpDoc.get("tokens", MongoDBHandler.DOC_LIST_CLASS);
|
||||
nlp.setTokens(Token.readTokensFromMongo(tokensDocs));
|
||||
|
||||
List<Document> sentencesDocs = nlpDoc.get("sentences", MongoDBHandler.DOC_LIST_CLASS);
|
||||
|
||||
List<Document> dependenciesDocs = nlpDoc.get("dependencies", MongoDBHandler.DOC_LIST_CLASS);
|
||||
|
||||
List<Document> namedEntitiesDocs = nlpDoc.get("namedEntities", MongoDBHandler.DOC_LIST_CLASS);
|
||||
|
||||
List<Document> sentimentsDocs = nlpDoc.get("sentiments", MongoDBHandler.DOC_LIST_CLASS);
|
||||
|
||||
List<Document> topicsDocs = nlpDoc.get("topics", MongoDBHandler.DOC_LIST_CLASS);
|
||||
nlp.setTopics(Topic.readTopicsFromMongo(topicsDocs));
|
||||
|
||||
// TODO: Video
|
||||
|
||||
return nlp;
|
||||
}
|
||||
|
||||
public String getSpeechKey() {
|
||||
|
@ -68,16 +99,26 @@ public class HtmlSpeech {
|
|||
content.add(contentLine);
|
||||
}
|
||||
|
||||
public NlpInfo getNlp() {
|
||||
return nlp;
|
||||
}
|
||||
|
||||
public void setNlp(NlpInfo nlp) {
|
||||
this.nlp = nlp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof HtmlSpeech that)) return false;
|
||||
return Objects.equals(speechKey, that.speechKey) && Objects.equals(speakerName, that.speakerName) && Objects.equals(fraction, that.fraction) && Objects.equals(content, that.content);
|
||||
return Objects.equals(speechKey, that.speechKey) && Objects.equals(speakerName, that.speakerName)
|
||||
&& Objects.equals(fraction, that.fraction) && Objects.equals(content, that.content)
|
||||
&& Objects.equals(nlp, that.nlp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(speechKey, speakerName, fraction, content);
|
||||
return Objects.hash(speechKey, speakerName, fraction, content, nlp);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -87,6 +128,7 @@ public class HtmlSpeech {
|
|||
.add("speakerName='" + speakerName + "'")
|
||||
.add("fraction='" + fraction + "'")
|
||||
.add("content=" + content)
|
||||
.add("nlp=" + nlp)
|
||||
.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class AudioToken {
|
||||
|
||||
private int begin;
|
||||
private int end;
|
||||
private double timeStart;
|
||||
private double timeEnd;
|
||||
private String value;
|
||||
|
||||
public AudioToken() {
|
||||
}
|
||||
|
||||
public AudioToken(int begin, int end, double timeStart, double timeEnd, String value) {
|
||||
this.begin = begin;
|
||||
this.end = end;
|
||||
this.timeStart = timeStart;
|
||||
this.timeEnd = timeEnd;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
public void setBegin(int begin) {
|
||||
this.begin = begin;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public void setEnd(int end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public double getTimeStart() {
|
||||
return timeStart;
|
||||
}
|
||||
|
||||
public void setTimeStart(double timeStart) {
|
||||
this.timeStart = timeStart;
|
||||
}
|
||||
|
||||
public double getTimeEnd() {
|
||||
return timeEnd;
|
||||
}
|
||||
|
||||
public void setTimeEnd(double timeEnd) {
|
||||
this.timeEnd = timeEnd;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof AudioToken that)) return false;
|
||||
return begin == that.begin && end == that.end && Double.compare(timeStart, that.timeStart) == 0 && Double.compare(timeEnd, that.timeEnd) == 0 && Objects.equals(value, that.value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(begin, end, timeStart, timeEnd, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", AudioToken.class.getSimpleName() + "[", "]")
|
||||
.add("begin=" + begin)
|
||||
.add("end=" + end)
|
||||
.add("timeStart=" + timeStart)
|
||||
.add("timeEnd=" + timeEnd)
|
||||
.add("value='" + value + "'")
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class Dependency {
|
||||
String type;
|
||||
String governor;
|
||||
String dependent;
|
||||
|
||||
public Dependency() {
|
||||
}
|
||||
|
||||
public Dependency(String type, String governor, String dependent) {
|
||||
this.type = type;
|
||||
this.governor = governor;
|
||||
this.dependent = dependent;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getGovernor() {
|
||||
return governor;
|
||||
}
|
||||
|
||||
public void setGovernor(String governor) {
|
||||
this.governor = governor;
|
||||
}
|
||||
|
||||
public String getDependent() {
|
||||
return dependent;
|
||||
}
|
||||
|
||||
public void setDependent(String dependent) {
|
||||
this.dependent = dependent;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Dependency that)) return false;
|
||||
return Objects.equals(type, that.type) && Objects.equals(governor, that.governor) && Objects.equals(dependent, that.dependent);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(type, governor, dependent);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Dependency.class.getSimpleName() + "[", "]")
|
||||
.add("type='" + type + "'")
|
||||
.add("governor='" + governor + "'")
|
||||
.add("dependent='" + dependent + "'")
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class NamedEntity {
|
||||
String type; // PER, LOC etc.
|
||||
// int begin; // TODO: momentan nicht in MongoDB
|
||||
// int end; // TODO: momentan nicht in MongoDB
|
||||
String text;
|
||||
|
||||
public NamedEntity() {
|
||||
}
|
||||
|
||||
public NamedEntity(String type, String text) {
|
||||
this.type = type;
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof NamedEntity that)) return false;
|
||||
return Objects.equals(type, that.type) && Objects.equals(text, that.text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(type, text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", NamedEntity.class.getSimpleName() + "[", "]")
|
||||
.add("type='" + type + "'")
|
||||
.add("text='" + text + "'")
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class NlpInfo {
|
||||
List<Token> tokens;
|
||||
List<Sentence> sentences;
|
||||
List<Dependency> dependencies;
|
||||
List<NamedEntity> namedEntities;
|
||||
Sentiment overallSentiment; // Sentiment for the whole text ; kann null sein!
|
||||
List<Sentiment> sentiments; // sentiments for the respective sentences (eine Liste von 0..n Elementen)
|
||||
List<Topic> topics;
|
||||
List<Pos> posList;
|
||||
|
||||
VideoInformation videoInformation;
|
||||
|
||||
public List<Token> getTokens() {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
public void setTokens(List<Token> tokens) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
public List<Sentence> getSentences() {
|
||||
return sentences;
|
||||
}
|
||||
|
||||
public void setSentences(List<Sentence> sentences) {
|
||||
this.sentences = sentences;
|
||||
}
|
||||
|
||||
public List<Dependency> getDependencies() {
|
||||
return dependencies;
|
||||
}
|
||||
|
||||
public void setDependencies(List<Dependency> dependencies) {
|
||||
this.dependencies = dependencies;
|
||||
}
|
||||
|
||||
public List<NamedEntity> getNamedEntities() {
|
||||
return namedEntities;
|
||||
}
|
||||
|
||||
public void setNamedEntities(List<NamedEntity> namedEntities) {
|
||||
this.namedEntities = namedEntities;
|
||||
}
|
||||
|
||||
public Sentiment getOverallSentiment() {
|
||||
return overallSentiment;
|
||||
}
|
||||
|
||||
public void setOverallSentiment(Sentiment overallSentiment) {
|
||||
this.overallSentiment = overallSentiment;
|
||||
}
|
||||
|
||||
public List<Sentiment> getSentiments() {
|
||||
return sentiments;
|
||||
}
|
||||
|
||||
public void setSentiments(List<Sentiment> sentiments) {
|
||||
this.sentiments = sentiments;
|
||||
}
|
||||
|
||||
public List<Topic> getTopics() {
|
||||
return topics;
|
||||
}
|
||||
|
||||
public void setTopics(List<Topic> topics) {
|
||||
this.topics = topics;
|
||||
}
|
||||
|
||||
public List<Pos> getPosList() {
|
||||
return posList;
|
||||
}
|
||||
|
||||
public void setPosList(List<Pos> posList) {
|
||||
this.posList = posList;
|
||||
}
|
||||
|
||||
public VideoInformation getVideoInformation() {
|
||||
return videoInformation;
|
||||
}
|
||||
|
||||
public void setVideoInformation(VideoInformation videoInformation) {
|
||||
this.videoInformation = videoInformation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof NlpInfo nlpInfo)) return false;
|
||||
return Objects.equals(tokens, nlpInfo.tokens) && Objects.equals(sentences, nlpInfo.sentences)
|
||||
&& Objects.equals(dependencies, nlpInfo.dependencies) && Objects.equals(namedEntities, nlpInfo.namedEntities)
|
||||
&& Objects.equals(overallSentiment, nlpInfo.overallSentiment) && Objects.equals(sentiments, nlpInfo.sentiments)
|
||||
&& Objects.equals(topics, nlpInfo.topics) && Objects.equals(posList, nlpInfo.posList)
|
||||
&& Objects.equals(videoInformation, nlpInfo.videoInformation);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(tokens, sentences, dependencies, namedEntities, overallSentiment, sentiments, topics, posList, videoInformation);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", NlpInfo.class.getSimpleName() + "[", "]")
|
||||
.add("tokens=" + tokens)
|
||||
.add("sentences=" + sentences)
|
||||
.add("dependencies=" + dependencies)
|
||||
.add("namedEntities=" + namedEntities)
|
||||
.add("overallSentiment=" + overallSentiment)
|
||||
.add("sentiments=" + sentiments)
|
||||
.add("topics=" + topics)
|
||||
.add("posList=" + posList)
|
||||
.add("videoInformation=" + videoInformation)
|
||||
.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class Pos {
|
||||
String posValue; // ART, NN...
|
||||
String coarseValue; // PROPN...
|
||||
int begin;
|
||||
int end;
|
||||
String coveredText;
|
||||
|
||||
// Am Dateiende stehen beispiele für mögliche Werte
|
||||
|
||||
|
||||
public Pos() {
|
||||
}
|
||||
|
||||
public Pos(String posValue, String coarseValue, int begin, int end, String coveredText) {
|
||||
this.posValue = posValue;
|
||||
this.coarseValue = coarseValue;
|
||||
this.begin = begin;
|
||||
this.end = end;
|
||||
this.coveredText = coveredText;
|
||||
}
|
||||
|
||||
public String getPosValue() {
|
||||
return posValue;
|
||||
}
|
||||
|
||||
public void setPosValue(String posValue) {
|
||||
this.posValue = posValue;
|
||||
}
|
||||
|
||||
public String getCoarseValue() {
|
||||
return coarseValue;
|
||||
}
|
||||
|
||||
public void setCoarseValue(String coarseValue) {
|
||||
this.coarseValue = coarseValue;
|
||||
}
|
||||
|
||||
public int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
public void setBegin(int begin) {
|
||||
this.begin = begin;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public void setEnd(int end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public String getCoveredText() {
|
||||
return coveredText;
|
||||
}
|
||||
|
||||
public void setCoveredText(String coveredText) {
|
||||
this.coveredText = coveredText;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Pos pos)) return false;
|
||||
return begin == pos.begin && end == pos.end && Objects.equals(posValue, pos.posValue) && Objects.equals(coarseValue, pos.coarseValue) && Objects.equals(coveredText, pos.coveredText);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(posValue, coarseValue, begin, end, coveredText);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Pos.class.getSimpleName() + "[", "]")
|
||||
.add("posValue='" + posValue + "'")
|
||||
.add("coarseValue='" + coarseValue + "'")
|
||||
.add("begin=" + begin)
|
||||
.add("end=" + end)
|
||||
.add("coveredText='" + coveredText + "'")
|
||||
.toString();
|
||||
}
|
||||
|
||||
/* Beispielswerte:
|
||||
|
||||
MyPos{posValue='ART', coarseValue='DET', begin=0, end=3, coveredText='Die'},
|
||||
MyPos{posValue='NN', coarseValue='NOUN', begin=4, end=8, coveredText='Idee'},
|
||||
MyPos{posValue='APPR', coarseValue='ADP', begin=9, end=12, coveredText='von'},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=13, end=16, coveredText='Joe'},
|
||||
MyPos{posValue='NN', coarseValue='PROPN', begin=17, end=22, coveredText='Biden'},
|
||||
MyPos{posValue='APPR', coarseValue='ADP', begin=23, end=26, coveredText='aus'},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=27, end=36, coveredText='Bucharest'},
|
||||
MyPos{posValue='$,', coarseValue='PUNCT', begin=36, end=37, coveredText=','},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=38, end=46, coveredText='Rumänien'},
|
||||
MyPos{posValue='$,', coarseValue='PUNCT', begin=46, end=47, coveredText=','},
|
||||
MyPos{posValue='VVFIN', coarseValue='VERB', begin=48, end=53, coveredText='finde'},
|
||||
MyPos{posValue='PPER', coarseValue='PRON', begin=54, end=57, coveredText='ich'},
|
||||
MyPos{posValue='ADJD', coarseValue='ADV', begin=58, end=61, coveredText='gut'},
|
||||
MyPos{posValue='$.', coarseValue='PUNCT', begin=61, end=62, coveredText='.'},
|
||||
MyPos{posValue='ART', coarseValue='DET', begin=63, end=66, coveredText='Den'},
|
||||
MyPos{posValue='NN', coarseValue='NOUN', begin=67, end=76, coveredText='Vorschlag'},
|
||||
MyPos{posValue='APPR', coarseValue='ADP', begin=77, end=80, coveredText='von'},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=81, end=87, coveredText='Donald'},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=88, end=93, coveredText='Trump'},
|
||||
MyPos{posValue='APPR', coarseValue='ADP', begin=94, end=97, coveredText='aus'},
|
||||
MyPos{posValue='NE', coarseValue='PROPN', begin=98, end=108, coveredText='Frankreich'},
|
||||
MyPos{posValue='VVFIN', coarseValue='VERB', begin=109, end=114, coveredText='finde'},
|
||||
MyPos{posValue='PPER', coarseValue='PRON', begin=115, end=118, coveredText='ich'},
|
||||
MyPos{posValue='ADV', coarseValue='ADV', begin=119, end=126, coveredText='weniger'},
|
||||
MyPos{posValue='ADJD', coarseValue='ADV', begin=127, end=130, coveredText='gut'},
|
||||
MyPos{posValue='$.', coarseValue='PUNCT', begin=130, end=131, coveredText='.'}],
|
||||
*/
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class Sentence {
|
||||
// int begin; // TODO: momentan nicht in MongoDB
|
||||
// int end; // TODO: momentan nicht in MongoDB
|
||||
String text;
|
||||
|
||||
public Sentence() {
|
||||
}
|
||||
|
||||
public Sentence(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Sentence sentence)) return false;
|
||||
return Objects.equals(text, sentence.text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Sentence.class.getSimpleName() + "[", "]")
|
||||
.add("text='" + text + "'")
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class Sentiment {
|
||||
int begin;
|
||||
int end;
|
||||
double sentiment; // overall sentiment
|
||||
double negative;
|
||||
double neutral;
|
||||
double positive;
|
||||
|
||||
public Sentiment() {
|
||||
}
|
||||
|
||||
public Sentiment(int begin, int end, double sentiment, double negative, double neutral, double positive) {
|
||||
this.begin = begin;
|
||||
this.end = end;
|
||||
this.sentiment = sentiment;
|
||||
this.negative = negative;
|
||||
this.neutral = neutral;
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
public void setBegin(int begin) {
|
||||
this.begin = begin;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public void setEnd(int end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public double getSentiment() {
|
||||
return sentiment;
|
||||
}
|
||||
|
||||
public void setSentiment(double sentiment) {
|
||||
this.sentiment = sentiment;
|
||||
}
|
||||
|
||||
public double getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public void setNegative(double negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public double getNeutral() {
|
||||
return neutral;
|
||||
}
|
||||
|
||||
public void setNeutral(double neutral) {
|
||||
this.neutral = neutral;
|
||||
}
|
||||
|
||||
public double getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public void setPositive(double positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Sentiment sentiment1)) return false;
|
||||
return begin == sentiment1.begin && end == sentiment1.end && Double.compare(sentiment, sentiment1.sentiment) == 0 && Double.compare(negative, sentiment1.negative) == 0 && Double.compare(neutral, sentiment1.neutral) == 0 && Double.compare(positive, sentiment1.positive) == 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(begin, end, sentiment, negative, neutral, positive);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Sentiment.class.getSimpleName() + "[", "]")
|
||||
.add("begin=" + begin)
|
||||
.add("end=" + end)
|
||||
.add("sentiment=" + sentiment)
|
||||
.add("negative=" + negative)
|
||||
.add("neutral=" + neutral)
|
||||
.add("positive=" + positive)
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,96 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import org.bson.Document;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class Token {
|
||||
String text;
|
||||
String pos;
|
||||
String lemma;
|
||||
|
||||
public Token() {
|
||||
}
|
||||
|
||||
public Token(String text, String pos, String lemma) {
|
||||
this.text = text;
|
||||
this.pos = pos;
|
||||
this.lemma = lemma;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public String getPos() {
|
||||
return pos;
|
||||
}
|
||||
|
||||
public void setPos(String pos) {
|
||||
this.pos = pos;
|
||||
}
|
||||
|
||||
public String getLemma() {
|
||||
return lemma;
|
||||
}
|
||||
|
||||
public void setLemma(String lemma) {
|
||||
this.lemma = lemma;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Token token)) return false;
|
||||
return Objects.equals(text, token.text) && Objects.equals(pos, token.pos) && Objects.equals(lemma, token.lemma);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(text, pos, lemma);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Token.class.getSimpleName() + "[", "]")
|
||||
.add("text='" + text + "'")
|
||||
.add("pos='" + pos + "'")
|
||||
.add("lemma='" + lemma + "'")
|
||||
.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Die Token-Dokumente (Speech --> analysisResults --> token) aus der MongoDB lesen
|
||||
* @param tokenDocs Eine Liste von Mongo-Dokumenten
|
||||
* @return Eine Liste der Token
|
||||
*/
|
||||
public static List<Token> readTokensFromMongo(List<Document> tokenDocs) {
|
||||
List<Token> tokens = new ArrayList<>();
|
||||
for (Document doc : tokenDocs) {
|
||||
tokens.add(new Token(doc.getString("text"),
|
||||
doc.getString("pos"),
|
||||
doc.getString("lemma")
|
||||
));
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Zählt alle verschiedenen POS Vorkommen auf
|
||||
* @param tokenList
|
||||
* @return Jede POS art mit ihrer Anzahl an Vorkommen
|
||||
*/
|
||||
public static Map<String, Integer> countPOS(List<Token> tokenList) {
|
||||
Map<String, Integer> posCounts = new HashMap<>();
|
||||
|
||||
for (Token token : tokenList) {
|
||||
posCounts.put(token.getPos(), posCounts.getOrDefault(token.getPos(), 0) + 1);
|
||||
}
|
||||
|
||||
return posCounts;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import org.bson.Document;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class Topic {
|
||||
String topic;
|
||||
Double score;
|
||||
String text;
|
||||
|
||||
public Topic() {
|
||||
}
|
||||
|
||||
public Topic(String topic, Double score, String text) {
|
||||
this.topic = topic;
|
||||
this.score = score;
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public String getTopic() {
|
||||
return topic;
|
||||
}
|
||||
|
||||
public void setTopic(String topic) {
|
||||
this.topic = topic;
|
||||
}
|
||||
|
||||
public Double getScore() {
|
||||
return score;
|
||||
}
|
||||
|
||||
public void setScore(Double score) {
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof Topic topic1)) return false;
|
||||
return Double.compare(score, topic1.score) == 0 && Objects.equals(topic, topic1.topic) && Objects.equals(text, topic1.text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(topic, score, text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", Topic.class.getSimpleName() + "[", "]")
|
||||
.add("topic='" + topic + "'")
|
||||
.add("score=" + score)
|
||||
.add("text='" + text + "'")
|
||||
.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Die Topics-Dokumente (Speech --> analysisResults --> topics) aus der MongoDB lesen
|
||||
* @param topicsDocs Eine Liste von Mongo-Dokumenten
|
||||
* @return Eine Liste der Topics
|
||||
*/
|
||||
public static List<Topic> readTopicsFromMongo(List<Document> topicsDocs) {
|
||||
List<Topic> topics = new ArrayList<>();
|
||||
for (Document doc : topicsDocs) {
|
||||
topics.add(new Topic(doc.getString("topic"),
|
||||
doc.getDouble("score"),
|
||||
doc.getString("text")
|
||||
));
|
||||
}
|
||||
return topics;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Topic-Informationen "verdichten":
|
||||
* Ausgangssituation: eine Liste mit mehreren Topics. Ein Topic kann in dieser Liste mehrfach vorkommen.
|
||||
* Man will wissen, welche Score hat jeden Topic. Hier werden die Werte der jeweiligen Topics summiert.
|
||||
*
|
||||
* @param topicsList
|
||||
* @return Map<String, Double>
|
||||
*/
|
||||
public static Map<String, Double> condenseTopicInformation(List<Topic> topicsList) {
|
||||
Map<String, Double> condensedTopicInfo = new HashMap<>();
|
||||
|
||||
for (Topic t : topicsList) {
|
||||
Double oldValue = condensedTopicInfo.get(t.getTopic());
|
||||
if (oldValue != null) {
|
||||
condensedTopicInfo.replace(t.getTopic(), oldValue + t.getScore());
|
||||
} else {
|
||||
condensedTopicInfo.put(t.getTopic(), t.getScore());
|
||||
}
|
||||
}
|
||||
|
||||
return condensedTopicInfo;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class VideoInformation {
|
||||
List<AudioToken> audioTokens;
|
||||
|
||||
public VideoInformation() {
|
||||
}
|
||||
|
||||
public VideoInformation(List<AudioToken> audioTokens) {
|
||||
this.audioTokens = audioTokens;
|
||||
}
|
||||
|
||||
public List<AudioToken> getAudioTokens() {
|
||||
return audioTokens;
|
||||
}
|
||||
|
||||
public void setAudioTokens(List<AudioToken> audioTokens) {
|
||||
this.audioTokens = audioTokens;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof VideoInformation that)) return false;
|
||||
return Objects.equals(audioTokens, that.audioTokens);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(audioTokens);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", VideoInformation.class.getSimpleName() + "[", "]")
|
||||
.add("audioTokens=" + audioTokens)
|
||||
.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.domain.nlp.html;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Diese Klasse ordnet das entspreche Sentiment zu einem Satz zu.
|
||||
* Sie ist ein Datencontainer für die Darstellung über FreeMarker
|
||||
*/
|
||||
public class SentimentOfSentence {
|
||||
int begin;
|
||||
int end;
|
||||
String text;
|
||||
// RGBA Werte für die Darstellung
|
||||
float sentiment; // overall sentiment --> wird für den alpha (Opaque) Wert verwendet --> 0..1
|
||||
int negative; // red --> 0..255
|
||||
int neutral; // 0..255, wird momentan nicht benutzt
|
||||
int positive; // green --> 0..255
|
||||
|
||||
public SentimentOfSentence() {}
|
||||
|
||||
public SentimentOfSentence(int begin, int end, String text, float sentiment, int negative, int neutral, int positive) {
|
||||
this.begin = begin;
|
||||
this.end = end;
|
||||
this.text = text;
|
||||
this.sentiment = sentiment;
|
||||
this.negative = negative;
|
||||
this.neutral = neutral;
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public int getBegin() {
|
||||
return begin;
|
||||
}
|
||||
|
||||
public void setBegin(int begin) {
|
||||
this.begin = begin;
|
||||
}
|
||||
|
||||
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
public void setEnd(int end) {
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public float getSentiment() {
|
||||
return sentiment;
|
||||
}
|
||||
|
||||
public void setSentiment(float sentiment) {
|
||||
this.sentiment = sentiment;
|
||||
}
|
||||
|
||||
public int getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public void setNegative(int negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public int getNeutral() {
|
||||
return neutral;
|
||||
}
|
||||
|
||||
public void setNeutral(int neutral) {
|
||||
this.neutral = neutral;
|
||||
}
|
||||
|
||||
public int getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public void setPositive(int positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof SentimentOfSentence that)) return false;
|
||||
return begin == that.begin && end == that.end && Double.compare(sentiment, that.sentiment) == 0 && Double.compare(negative, that.negative) == 0 && Double.compare(neutral, that.neutral) == 0 && Double.compare(positive, that.positive) == 0 && Objects.equals(text, that.text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(begin, end, text, sentiment, negative, neutral, positive);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder sb = new StringBuilder("SentimentOfSentence{");
|
||||
sb.append("begin=").append(begin);
|
||||
sb.append(", end=").append(end);
|
||||
sb.append(", text='").append(text).append('\'');
|
||||
sb.append(", sentiment=").append(sentiment);
|
||||
sb.append(", negative=").append(negative);
|
||||
sb.append(", neutral=").append(neutral);
|
||||
sb.append(", positive=").append(positive);
|
||||
sb.append('}');
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -360,6 +360,8 @@ public class NlpUtils {
|
|||
bulkOperations.add(new UpdateOneModel<>(updateFilter, update));
|
||||
}
|
||||
if (!bulkOperations.isEmpty()) {
|
||||
System.out.println("Processing of " + bulkOperations.size() + " documents finished");
|
||||
System.out.println("uploading...");
|
||||
mongoDBHandler.bulkWriteNlpData(bulkOperations);
|
||||
Logger.debug("Bulk write completed for " + bulkOperations.size() + " documents.");
|
||||
mongoDBHandler.close();
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
package org.texttechnologylab.project.gruppe_05_1.nlp;
|
||||
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import com.mongodb.client.model.Filters;
|
||||
import com.mongodb.client.model.UpdateOneModel;
|
||||
import com.mongodb.client.model.WriteModel;
|
||||
import com.mongodb.client.result.UpdateResult;
|
||||
import org.apache.uima.fit.util.JCasUtil;
|
||||
import org.bson.Document;
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.zip.*;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -18,12 +19,7 @@ import org.bson.conversions.Bson;
|
|||
import org.apache.uima.fit.factory.JCasFactory;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.cas.impl.XmiCasDeserializer;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
|
||||
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
|
||||
import org.hucompute.textimager.uima.type.category.CategoryCoveredTagged;
|
||||
import org.texttechnologylab.project.gruppe_05_1.util.Logger;
|
||||
|
||||
import static org.texttechnologylab.project.gruppe_05_1.Main.JCAS_SPEECHES_TYPESYSTEM_DIR;
|
||||
|
@ -31,29 +27,45 @@ import static org.texttechnologylab.project.gruppe_05_1.Main.JCAS_SPEECHES_TYPES
|
|||
|
||||
public class XmiExtractor {
|
||||
|
||||
private List<WriteModel<Document>> bulkOperations;
|
||||
private MongoDBHandler mongoDBHandler;
|
||||
private final List<WriteModel<Document>> bulkOperations = Collections.synchronizedList(new ArrayList<>());
|
||||
private final MongoDBHandler mongoDBHandler;
|
||||
private static final int BATCH_SIZE = 1000;
|
||||
private int processedCount = 0;
|
||||
private static final AtomicInteger processedCount = new AtomicInteger(0);
|
||||
|
||||
public XmiExtractor() {
|
||||
mongoDBHandler = new MongoDBHandler();
|
||||
this.bulkOperations = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void extractAndUploadXmiData() throws IOException {
|
||||
InputStream resourceStream = getClass().getClassLoader().getResourceAsStream("speeches/20.zip");
|
||||
if (resourceStream == null) {
|
||||
throw new IOException("20.zip nicht gefunden im Ressourcenordner /speeches");
|
||||
}
|
||||
ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
|
||||
List<Future<?>> futures = new ArrayList<>();
|
||||
try (ZipInputStream zis = new ZipInputStream(resourceStream)) {
|
||||
ZipEntry entry;
|
||||
while ((entry = zis.getNextEntry()) != null) {
|
||||
if (entry.getName().endsWith(".xmi.gz")) {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
byte[] buffer = new byte[1024];
|
||||
int len;
|
||||
while ((len = zis.read(buffer)) > 0) {
|
||||
baos.write(buffer, 0, len);
|
||||
File tempFile = File.createTempFile("xmi_entry_", ".xmi.gz");
|
||||
try (FileOutputStream fos = new FileOutputStream(tempFile)) {
|
||||
byte[] buffer = new byte[1024];
|
||||
int len;
|
||||
while ((len = zis.read(buffer)) > 0) {
|
||||
fos.write(buffer, 0, len);
|
||||
}
|
||||
}
|
||||
byte[] entryData = baos.toByteArray();
|
||||
processXmiGzStream(new ByteArrayInputStream(entryData), entry.getName());
|
||||
ZipEntry finalEntry = entry;
|
||||
Future<?> future = executor.submit(() -> {
|
||||
try (FileInputStream fis = new FileInputStream(tempFile)) {
|
||||
processXmiGzStream(fis, finalEntry.getName());
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
tempFile.delete();
|
||||
}
|
||||
});
|
||||
futures.add(future);
|
||||
}
|
||||
zis.closeEntry();
|
||||
}
|
||||
|
@ -61,7 +73,16 @@ public class XmiExtractor {
|
|||
Logger.error("Error reading XMI data from ZIP file.");
|
||||
Logger.error(e.getMessage());
|
||||
}
|
||||
flushBatch();
|
||||
for (Future<?> future : futures) {
|
||||
try {
|
||||
future.get();
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
executor.shutdown();
|
||||
flushBatch(); // Synchronously upload the remaining batch
|
||||
mongoDBHandler.close();
|
||||
}
|
||||
|
||||
private void processXmiGzStream(InputStream inputStream, String filename) {
|
||||
|
@ -70,12 +91,10 @@ public class XmiExtractor {
|
|||
jCas = JCasFactory.createJCas(JCAS_SPEECHES_TYPESYSTEM_DIR);
|
||||
XmiCasDeserializer.deserialize(gis, jCas.getCas(), true);
|
||||
|
||||
// Build structured analysisResults Document
|
||||
Document analysisResults = new Document();
|
||||
|
||||
// Tokens: Include POS, Lemma, etc.
|
||||
List<Document> tokens = new ArrayList<>();
|
||||
for (Token token : JCasUtil.select(jCas, Token.class)) {
|
||||
for (de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token token : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token.class)) {
|
||||
Document tokenDoc = new Document()
|
||||
.append("text", token.getCoveredText())
|
||||
.append("pos", token.getPos().getPosValue())
|
||||
|
@ -84,15 +103,13 @@ public class XmiExtractor {
|
|||
}
|
||||
analysisResults.append("tokens", tokens);
|
||||
|
||||
// Sentences
|
||||
List<String> sentences = JCasUtil.select(jCas, Sentence.class).stream()
|
||||
.map(Sentence::getCoveredText)
|
||||
List<String> sentences = JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence.class).stream()
|
||||
.map(de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence::getCoveredText)
|
||||
.collect(Collectors.toList());
|
||||
analysisResults.append("sentences", sentences);
|
||||
|
||||
// Dependencies
|
||||
List<Document> dependencies = new ArrayList<>();
|
||||
for (Dependency dep : JCasUtil.select(jCas, Dependency.class)) {
|
||||
for (de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency dep : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency.class)) {
|
||||
Document depDoc = new Document()
|
||||
.append("type", dep.getDependencyType())
|
||||
.append("governor", dep.getGovernor().getCoveredText())
|
||||
|
@ -101,9 +118,8 @@ public class XmiExtractor {
|
|||
}
|
||||
analysisResults.append("dependencies", dependencies);
|
||||
|
||||
// Named Entities
|
||||
List<Document> namedEntities = new ArrayList<>();
|
||||
for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
|
||||
for (de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity ne : JCasUtil.select(jCas, de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity.class)) {
|
||||
Document neDoc = new Document()
|
||||
.append("text", ne.getCoveredText())
|
||||
.append("type", ne.getValue());
|
||||
|
@ -111,23 +127,16 @@ public class XmiExtractor {
|
|||
}
|
||||
analysisResults.append("namedEntities", namedEntities);
|
||||
|
||||
// Sentiment
|
||||
List<Document> sentiments = new ArrayList<>();
|
||||
for (org.hucompute.textimager.uima.type.Sentiment sentiment :
|
||||
JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) {
|
||||
|
||||
for (org.hucompute.textimager.uima.type.Sentiment sentiment : JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) {
|
||||
Document sentimentDoc = new Document()
|
||||
.append("begin", sentiment.getBegin())
|
||||
.append("end", sentiment.getEnd())
|
||||
.append("score", sentiment.getSentiment())
|
||||
.append("subjectivity", sentiment.getSubjectivity());
|
||||
|
||||
// Check for VaderSentiment subtype
|
||||
if (sentiment instanceof org.hucompute.textimager.uima.type.VaderSentiment) {
|
||||
org.hucompute.textimager.uima.type.VaderSentiment vader =
|
||||
(org.hucompute.textimager.uima.type.VaderSentiment) sentiment;
|
||||
sentimentDoc
|
||||
.append("pos", vader.getPos())
|
||||
org.hucompute.textimager.uima.type.VaderSentiment vader = (org.hucompute.textimager.uima.type.VaderSentiment) sentiment;
|
||||
sentimentDoc.append("pos", vader.getPos())
|
||||
.append("neu", vader.getNeu())
|
||||
.append("neg", vader.getNeg());
|
||||
}
|
||||
|
@ -136,39 +145,36 @@ public class XmiExtractor {
|
|||
analysisResults.append("sentiments", sentiments);
|
||||
|
||||
List<Document> topics = new ArrayList<>();
|
||||
for (CategoryCoveredTagged topic : JCasUtil.select(jCas, CategoryCoveredTagged.class)) {
|
||||
for (org.hucompute.textimager.uima.type.category.CategoryCoveredTagged topic : JCasUtil.select(jCas, org.hucompute.textimager.uima.type.category.CategoryCoveredTagged.class)) {
|
||||
Document topicDoc = new Document()
|
||||
.append("topic", topic.getValue())
|
||||
.append("score", topic.getScore())
|
||||
.append("tags", topic.getTags())
|
||||
.append("text", topic.getCoveredText());
|
||||
topics.add(topicDoc);
|
||||
}
|
||||
topics.sort((d1, d2) -> Double.compare(d2.getDouble("score"), d1.getDouble("score")));
|
||||
analysisResults.append("topics", topics);
|
||||
|
||||
|
||||
// Upload structured Document to MongoDB
|
||||
String speechKey = extractSpeechKeyFromFilename(filename);
|
||||
if (speechKey != null) {
|
||||
Bson filter = Filters.eq("speechKey", speechKey);
|
||||
Bson update = new Document("$set", new Document("analysisResults", analysisResults));
|
||||
UpdateOneModel<Document> updateModel = new UpdateOneModel<>(filter, update);
|
||||
bulkOperations.add(updateModel);
|
||||
if (bulkOperations.size() >= BATCH_SIZE) {
|
||||
flushBatch();
|
||||
synchronized (bulkOperations) {
|
||||
if (bulkOperations.size() >= BATCH_SIZE) {
|
||||
Logger.info("BATCH_SIZE to Upload: " + bulkOperations.size());
|
||||
flushBatch();
|
||||
}
|
||||
}
|
||||
processedCount++;
|
||||
if (processedCount % 5000 == 0) {
|
||||
Logger.info("Processed speeches: " + processedCount);
|
||||
int count = processedCount.incrementAndGet();
|
||||
if (count % 1000 == 0) {
|
||||
Logger.info("Processed speeches: " + count);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
finally {
|
||||
} finally {
|
||||
if (jCas != null) {
|
||||
jCas.reset();
|
||||
}
|
||||
|
@ -180,23 +186,11 @@ public class XmiExtractor {
|
|||
return baseName.replace("20/", "");
|
||||
}
|
||||
|
||||
private void flushBatch() {
|
||||
private synchronized void flushBatch() {
|
||||
if (!bulkOperations.isEmpty()) {
|
||||
mongoDBHandler.bulkWriteNlpData(bulkOperations);
|
||||
bulkOperations.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
XmiExtractor extractor = new XmiExtractor(database);
|
||||
extractor.extractAndUploadXmiData();
|
||||
System.out.println("Processing complete.");
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} */
|
||||
}
|
||||
|
||||
|
|
|
@ -89,6 +89,7 @@ public class ParlamentarierController {
|
|||
String id = ctx.pathParam("id");
|
||||
Logger.info("getParlamentarierDetails, ID = " + id);
|
||||
|
||||
// Alle Details des Abgeordnetes (Vor- und Nachname, Geburts- und Sterbeort, Partei, Vita etc.)
|
||||
ParlamentarierDetails pd = MongoPprUtils.getParlamentarierDetailsByID(id);
|
||||
|
||||
Map<String, Object> attributes = new HashMap<>();
|
||||
|
@ -101,6 +102,10 @@ public class ParlamentarierController {
|
|||
attributes.put("speechesPlaceholder", emptyList);
|
||||
}
|
||||
|
||||
// Foto des Abgeordnetes
|
||||
String picture = MongoPprUtils.getParlamentarierPictureByID(id);
|
||||
attributes.put("pic", picture);
|
||||
|
||||
ctx.render("parlamentarierDetails.ftl", attributes);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,12 +5,16 @@ import io.javalin.openapi.*;
|
|||
import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.html.HtmlSpeech;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.html.ParlamentarierDetails;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Token;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.nlp.Topic;
|
||||
import org.texttechnologylab.project.gruppe_05_1.domain.speech.SpeechMetaData;
|
||||
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SpeechController {
|
||||
/**
|
||||
|
@ -68,6 +72,38 @@ public class SpeechController {
|
|||
HtmlSpeech speech = MongoPprUtils.getSpeechByKey(redeId);
|
||||
attributes.put("s", speech);
|
||||
|
||||
// NLP: Topic
|
||||
if ((speech.getNlp() != null) && (speech.getNlp().getTopics() != null)) {
|
||||
Map<String, Double> topics = Topic.condenseTopicInformation(speech.getNlp().getTopics()); // Daten "verdichten"...
|
||||
// ... und ersetzen
|
||||
speech.getNlp().setTopics(
|
||||
topics.entrySet().stream()
|
||||
.map(me -> new Topic(me.getKey(), me.getValue(), null))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
// NLP: POS
|
||||
if (speech.getNlp() != null && speech.getNlp().getTokens() != null) {
|
||||
List<Token> tokens = speech.getNlp().getTokens();
|
||||
|
||||
Map<String, Integer> posCounts = Token.countPOS(tokens);
|
||||
|
||||
List<Token> posList = posCounts.entrySet().stream()
|
||||
.map(entry -> new Token(entry.getKey(), String.valueOf(entry.getValue()), "")) // Lemma remains empty
|
||||
.collect(Collectors.toList());
|
||||
|
||||
System.out.println("DEBUG: Sending POS List to NLP - " + posList);
|
||||
|
||||
speech.getNlp().setPosList((List) posList);
|
||||
|
||||
} else {
|
||||
System.out.println("DEBUG: POS List is EMPTY");
|
||||
speech.getNlp().setPosList((List) new ArrayList<Token>()); // Ensure it's never null
|
||||
}
|
||||
|
||||
// TODO: Token wird momentan etwas komisch abgespeichert, da im Attribut text die POS art steht, und in pos die Anzahl dieser POS arten. Umstrukturieren damit keine Verwirrung herrscht
|
||||
|
||||
|
||||
ctx.render("speech.ftl", attributes);
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.io.*;
|
|||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipInputStream;
|
||||
|
@ -423,4 +425,61 @@ public abstract class PPRUtils {
|
|||
return fileNames;
|
||||
}
|
||||
|
||||
public static Set<org.w3c.dom.Document> checkAndProcessNewProtocols(MongoDBHandler mongoDBHandler) {
|
||||
Set<org.w3c.dom.Document> newProtocols = new HashSet<>();
|
||||
int offset = 0;
|
||||
int limit = 10;
|
||||
boolean hasMore = true;
|
||||
Pattern sessionPattern = Pattern.compile("Plenarprotokoll der (\\d+)\\. Sitzung");
|
||||
|
||||
while (hasMore) {
|
||||
String queryUrl = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?limit="
|
||||
+ limit + "&noFilterSet=true&offset=" + offset;
|
||||
try {
|
||||
org.jsoup.nodes.Document htmlDoc = Jsoup.connect(queryUrl).get();
|
||||
Elements sessionLinks = htmlDoc.select("a.bt-link-dokument");
|
||||
if (sessionLinks.isEmpty()) break;
|
||||
|
||||
for (org.jsoup.nodes.Element link : sessionLinks) {
|
||||
String xmlUrl = link.attr("href");
|
||||
String fileName = xmlUrl.substring(xmlUrl.lastIndexOf('/') + 1); // "20212.xml"
|
||||
// Entferne die Dateiendung
|
||||
String sessionNumberFull = fileName.replace(".xml", ""); // z.B. "20212"
|
||||
String sessionNumber;
|
||||
if (sessionNumberFull.startsWith("20") && sessionNumberFull.length() > 2) {
|
||||
sessionNumber = sessionNumberFull.substring(2);
|
||||
} else {
|
||||
sessionNumber = sessionNumberFull;
|
||||
}
|
||||
if (!mongoDBHandler.sessionExists(sessionNumber)) {
|
||||
try {
|
||||
org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl);
|
||||
newProtocols.add(xmlDoc);
|
||||
} catch (Exception ex) {
|
||||
Logger.error("Error processing XML for session " + sessionNumber + ": " + ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
org.jsoup.nodes.Element metaSlider = htmlDoc.selectFirst("div.meta-slider");
|
||||
if (metaSlider != null && metaSlider.hasAttr("data-nextoffset")) {
|
||||
int nextOffset = Integer.parseInt(metaSlider.attr("data-nextoffset"));
|
||||
if (nextOffset <= offset) {
|
||||
hasMore = false;
|
||||
} else {
|
||||
offset = nextOffset;
|
||||
}
|
||||
} else {
|
||||
hasMore = false;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Logger.error("Error loading page: " + queryUrl + " : " + e.getMessage());
|
||||
break;
|
||||
}
|
||||
}
|
||||
return newProtocols;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -40,7 +40,6 @@ public class SpeechParser {
|
|||
}
|
||||
|
||||
public List<Session> parseAllSessions() {
|
||||
List<Session> sessionsEmpty = new ArrayList<>();
|
||||
List<Session> sessions = new ArrayList<>();
|
||||
this.speeches = new ArrayList<>();
|
||||
this.agendaItems = new ArrayList<>();
|
||||
|
@ -61,6 +60,26 @@ public class SpeechParser {
|
|||
|
||||
}
|
||||
|
||||
public List<Session> parseAllSessions(Set<Document> xmlDocuments) {
|
||||
List<Session> sessions = new ArrayList<>();
|
||||
this.speeches = new ArrayList<>();
|
||||
this.agendaItems = new ArrayList<>();
|
||||
Logger.info("All new sessions parsed");
|
||||
for (org.w3c.dom.Document xmlDoc : xmlDocuments) {
|
||||
try {
|
||||
File tempFile = convertDocumentToFile(xmlDoc);
|
||||
Session session = parseSessionFile(tempFile);
|
||||
sessions.add(session);
|
||||
tempFile.delete(); // Lösche die temporäre Datei nach der Verarbeitung
|
||||
} catch (Exception e) {
|
||||
Logger.error("Error parsing XML document.");
|
||||
Logger.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
return sessions;
|
||||
|
||||
}
|
||||
|
||||
private Session parseSessionFile(File file) throws Exception {
|
||||
//file = removeDoctypeAnnotation(file.getAbsolutePath());
|
||||
|
||||
|
|
|
@ -763,7 +763,7 @@ members = [
|
|||
]
|
||||
|
||||
# Base URL for querying (with placeholders for last name and first name)
|
||||
base_url = "https://bilddatenbank.bundestag.de/search/picture-result?query={0}%2C+{1}&filterQuery%5Bereignis%5D%5B%5D=Portr%C3%A4t%2FPortrait&sortVal=2"
|
||||
base_url = "https://bilddatenbank.bundestag.de/search/picture-result?query={0}+{1}&sortVal=2"
|
||||
#base_url = "https://bilddatenbank.bundestag.de/search/picture-result?filterQuery%5Bname%5D%5B%5D={0}l%2C+{1}&filterQuery%5Bereignis%5D%5B%5D=Portr%C3%A4t%2FPortrait&sortVal=2"
|
||||
|
||||
def fetch_image(lastname, firstname):
|
||||
|
|
|
@ -17,10 +17,9 @@
|
|||
<h1>${p.vorname} ${p.nachname} (${p.partei})</h1>
|
||||
</header>
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<#if pic??>
|
||||
<img style="max-width: 400px; height: auto;" src="data:image/jpeg;base64,${pic}" alt="Foto von ${p.vorname} ${p.nachname} (${p.partei})" />
|
||||
<img style="max-width: 400px; height: auto;" src="data:image/jpeg;base64,${pic}" alt="Foto von ${p.vorname} ${p.nachname} (${p.partei})" />
|
||||
<#else>
|
||||
<h2>(kein Foto verfügbar)</h2>
|
||||
</#if>
|
||||
|
|
|
@ -23,11 +23,19 @@
|
|||
|
||||
<h2>Rede ${s.speechKey} </h2>
|
||||
|
||||
<main>
|
||||
<#list s.content as c>
|
||||
<#include "speechContent.ftl">
|
||||
</#list>
|
||||
</main>
|
||||
<#list s.content as c>
|
||||
<#include "speechContent.ftl">
|
||||
</#list>
|
||||
|
||||
<br><br>
|
||||
<#if s.nlp??>
|
||||
<h2>NLP Information</h2>
|
||||
<#assign nlp = "${s.nlp}">
|
||||
<#include "nlp.ftl">
|
||||
<#else>
|
||||
<h2>Keine NLP Information verfügbar für diese Rede</h2>
|
||||
</#if>
|
||||
<br> <br>
|
||||
|
||||
|
||||
</body>
|
||||
|
|
|
@ -48,4 +48,3 @@
|
|||
});
|
||||
|
||||
</script>
|
||||
|
||||
|
|
Before Width: | Height: | Size: 1.8 KiB After Width: | Height: | Size: 1.8 KiB |
Loading…
Add table
Add a link
Reference in a new issue