Parsing von Files implementiert.

This commit is contained in:
Picman2000 2025-03-03 18:15:21 +01:00
parent cccd09bc8f
commit b38812e325
30 changed files with 1358 additions and 4 deletions

View file

@ -11,6 +11,7 @@ import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
import org.texttechnologylab.project.gruppe_05_1.util.PropertiesUtils;
import org.texttechnologylab.project.gruppe_05_1.util.XmlUtils;
import org.texttechnologylab.project.gruppe_05_1.xml.FileObjectFactory;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.SpeechParser;
import org.w3c.dom.Element;
import java.util.List;
@ -44,9 +45,10 @@ public class Main {
private static final FileObjectFactory xmlFactory = FileObjectFactory.getFactory();
private static final MongoObjectFactory mongoFactory = MongoObjectFactory.getFactory();
private static final SpeechParser speechParser = new SpeechParser();
public static void main(String[] args) {
//TEST
speechParser.parseAllSessions();
// Stellt fest, dass alle nötigen Datenbank-Collections existieren
PPRUtils.ensureCollectionExist();

View file

@ -34,6 +34,7 @@ public abstract class PPRUtils {
public static final String PARTEILOS_KUERZEL = "Parteilos";
private static Set<String> processedProtocols = new HashSet<>();
private static Set<org.w3c.dom.Document> xmlProtocols = new HashSet<>();
@ -363,7 +364,7 @@ public abstract class PPRUtils {
}
public static void processXML() {
public static Set<org.w3c.dom.Document> processXML() {
int offset = 0;
int limit = 10;
boolean hasMore = true;
@ -385,13 +386,14 @@ public abstract class PPRUtils {
System.out.println("Verarbeite XML: " + xmlUrl);
try {
org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl);
String uniqueId = xmlDoc.getDocumentElement().getAttribute("sitzung-nr");
if (processedProtocols.contains(uniqueId)) {
System.out.println("Protokoll bereits verarbeitet: " + uniqueId);
continue;
}
processedProtocols.add(uniqueId);
xmlProtocols.add(xmlDoc);
//TODO verarbeitung
} catch (Exception e) {
System.err.println("Fehler beim Verarbeiten der XML-Datei: " + xmlUrl);
@ -417,6 +419,7 @@ public abstract class PPRUtils {
break;
}
}
return xmlProtocols;
}
/**
@ -435,6 +438,13 @@ public abstract class PPRUtils {
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
dBuilder.setEntityResolver((publicId, systemId) -> {
InputStream dtdStream = PPRUtils.class.getResourceAsStream("/plenarprotokolle/dbtplenarprotokoll.dtd");
if (dtdStream != null) {
return new InputSource(dtdStream);
}
return null;
});
org.w3c.dom.Document doc = dBuilder.parse(connection.getInputStream());
doc.getDocumentElement().normalize();
return doc;

View file

@ -0,0 +1,39 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.AgendaItem;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.ArrayList;
/**
 * {@link AgendaItem} implementation holding the id, owning session id, title and
 * the speeches collected for one agenda item.
 * <p>
 * Getters for all fields are generated by Lombok's {@code @Getter}.
 */
@Getter
public class AgendaItem_File_Impl implements AgendaItem {

    private final int id;
    private final int sessionId;
    private final String title;
    /** Starts out empty; filled through {@link #addSpeech(Speech)}. */
    private final ArrayList<Speech> speeches = new ArrayList<>();

    /**
     * Creates an agenda item without any speeches.
     *
     * @param id        id of the agenda item
     * @param sessionId id of the session the agenda item belongs to
     * @param title     title of the agenda item
     */
    public AgendaItem_File_Impl(int id, int sessionId, String title) {
        this.title = title;
        this.sessionId = sessionId;
        this.id = id;
    }

    /**
     * Appends a speech to this agenda item.
     *
     * @param speech the speech to append
     */
    @Override
    public void addSpeech(Speech speech) {
        speeches.add(speech);
    }

    /**
     * @return always {@link MongoDBEntryType#AGENDA_ITEM}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.AGENDA_ITEM;
    }

    /**
     * HTML rendering is not implemented for agenda items yet.
     *
     * @return the empty string
     */
    @Override
    public String toHTML() {
        return "";
    }
}

View file

@ -0,0 +1,27 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Comment;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Content;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * Speech content element representing a comment, together with the name of the
 * commentator. Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Comment_File_Impl implements Content, Comment {

    private final int contentId;
    private final int speechId;
    private final String comment;
    private final String commentatorName;

    /**
     * Creates a comment entry.
     * <p>
     * Note the argument order: the commentator name comes before the comment text.
     *
     * @param contentId       id of this content element
     * @param speechId        id of the speech this comment belongs to
     * @param commentatorName name of the commentator
     * @param comment         text of the comment
     */
    public Comment_File_Impl(int contentId, int speechId, String commentatorName, String comment) {
        this.comment = comment;
        this.commentatorName = commentatorName;
        this.speechId = speechId;
        this.contentId = contentId;
    }

    /**
     * @return always {@link MongoDBEntryType#SPEECH_COMMENT}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.SPEECH_COMMENT;
    }
}

View file

@ -0,0 +1,34 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Constituency;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * {@link Constituency} implementation carrying an id, a name and a federal state.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Constituency_File_Impl implements Constituency {

    private final int id;
    private final String name;
    private final String federalState;

    /**
     * @param id           id of the constituency
     * @param name         name of the constituency
     * @param federalState federal state of the constituency
     */
    public Constituency_File_Impl(int id, String name, String federalState) {
        this.federalState = federalState;
        this.name = name;
        this.id = id;
    }

    /**
     * @return the constituency entry type constant (spelled {@code CONSTIUENCY} in the enum)
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.CONSTIUENCY;
    }

    /**
     * Renders id, name and federal state as a small HTML fragment.
     * Values are inserted as-is, without HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        StringBuilder markup = new StringBuilder();
        markup.append("<div class='constituency'>");
        markup.append("<h2>Constituency Details</h2>");
        markup.append("<p><strong>ID:</strong> ").append(id).append("</p>");
        markup.append("<p><strong>Name:</strong> ").append(name).append("</p>");
        markup.append("<p><strong>Federal State:</strong> ").append(federalState).append("</p>");
        markup.append("</div>");
        return markup.toString();
    }
}

View file

@ -0,0 +1,45 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Fraction;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * {@link Fraction} implementation with a mutable short name, a fixed long name
 * and a list of member ids. Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Fraction_File_Impl implements Fraction {

    private String name;
    private final String longName;
    private final List<Integer> members;

    /**
     * @param name     name of the fraction (can be changed later via {@link #updateName(String)})
     * @param longName long name of the fraction
     * @param members  member list; the passed list is stored directly (no copy) and
     *                 {@link #addMember(int)} appends to it
     */
    public Fraction_File_Impl(String name, String longName, List<Integer> members) {
        this.members = members;
        this.longName = longName;
        this.name = name;
    }

    /**
     * Replaces the name of the fraction.
     *
     * @param name the new name
     */
    @Override
    public void updateName(String name) {
        this.name = name;
    }

    /**
     * Appends a member to the member list handed to the constructor.
     *
     * @param member the member to add
     */
    @Override
    public void addMember(int member) {
        members.add(member);
    }

    /**
     * @return always {@link MongoDBEntryType#FRACTION}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.FRACTION;
    }

    /**
     * Renders name and long name as a small HTML fragment (members are not listed).
     * Values are inserted as-is, without HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        StringBuilder markup = new StringBuilder("<div class='fraction'>");
        markup.append("<h2>Fraction Details</h2>");
        markup.append("<p><strong>Name:</strong> ").append(name).append("</p>");
        markup.append("<p><strong>Long Name:</strong> ").append(longName).append("</p>");
        markup.append("</div>");
        return markup.toString();
    }
}

View file

@ -0,0 +1,31 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.HistoryEntry;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * {@link HistoryEntry} implementation storing an action, the affected content,
 * the content type and a date, all as strings.
 * Getters not written out explicitly are generated by Lombok's {@code @Getter}.
 */
@Getter
public class HistoryEntry_File_Impl implements HistoryEntry {

    private final String action;
    private final String content;
    private final String contentType;
    private final String date;

    /**
     * Creates a history entry. Note that {@code action} is the LAST argument
     * although it is the first declared field.
     *
     * @param content     content of the entry
     * @param contentType type of the content
     * @param date        date of the entry
     * @param action      action of the entry
     */
    public HistoryEntry_File_Impl(String content, String contentType, String date, String action) {
        this.content = content;
        this.contentType = contentType;
        this.date = date;
        this.action = action;
    }

    /**
     * @return always {@link MongoDBEntryType#HISTORY_ENTRY}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.HISTORY_ENTRY;
    }

    /**
     * @return the action of this entry
     */
    @Override
    public String getAction() {
        return action;
    }
}

View file

@ -0,0 +1,28 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Individual;
/**
 * Abstract base for {@link Individual} implementations. It only stores the
 * personal attributes as strings; {@code getType()} and {@code toHTML()} are
 * left to subclasses. Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public abstract class Individual_File_Impl implements Individual {

    private final String name;
    private final String firstName;
    private final String title;
    private final String dateOfBirth;
    private final String dateOfDeath;
    private final String placeOfBirth;
    private final String gender;
    private final String religion;

    /**
     * @param name         name (subclasses in this package treat it as the last name)
     * @param firstName    first name
     * @param title        title
     * @param dateOfBirth  date of birth
     * @param dateOfDeath  date of death
     * @param placeOfBirth place of birth
     * @param gender       gender
     * @param religion     religion
     */
    public Individual_File_Impl(String name, String firstName, String title, String dateOfBirth, String dateOfDeath, String placeOfBirth, String gender, String religion) {
        // Naming
        this.title = title;
        this.firstName = firstName;
        this.name = name;
        // Birth and death
        this.dateOfBirth = dateOfBirth;
        this.placeOfBirth = placeOfBirth;
        this.dateOfDeath = dateOfDeath;
        // Remaining attributes
        this.gender = gender;
        this.religion = religion;
    }
}

View file

@ -0,0 +1,37 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.LegislativePeriod;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * {@link LegislativePeriod} implementation with an id, start and end date and a
 * constituency, the latter three kept as strings.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class LegislativePeriod_File_Impl implements LegislativePeriod {

    private final int id;
    private final String startDate;
    private final String endDate;
    private final String constituency;

    /**
     * @param id           id of the legislative period
     * @param startDate    start date
     * @param endDate      end date
     * @param constituency constituency
     */
    public LegislativePeriod_File_Impl(int id, String startDate, String endDate, String constituency) {
        this.constituency = constituency;
        this.endDate = endDate;
        this.startDate = startDate;
        this.id = id;
    }

    /**
     * @return always {@link MongoDBEntryType#LEGISLATIVE_PERIOD}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.LEGISLATIVE_PERIOD;
    }

    /**
     * Renders all four attributes as a small HTML fragment.
     * Values are inserted as-is, without HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        StringBuilder markup = new StringBuilder("<div class='legislative-period'>");
        markup.append("<h2>Legislative Period Details</h2>");
        markup.append("<p><strong>ID:</strong> ").append(id).append("</p>");
        markup.append("<p><strong>Start Date:</strong> ").append(startDate).append("</p>");
        markup.append("<p><strong>End Date:</strong> ").append(endDate).append("</p>");
        markup.append("<p><strong>Constituency:</strong> ").append(constituency).append("</p>");
        return markup.append("</div>").toString();
    }
}

View file

@ -0,0 +1,24 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Content;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Line;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * Speech content element holding one piece of text of a speech.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Line_File_Impl implements Content, Line {

    private final int contentId;
    /** Id of the owning speech; {@code protected}, so subclasses can read it directly. */
    protected final int speechId;
    private final String content;

    /**
     * @param contentId id of this content element
     * @param speechId  id of the speech this line belongs to
     * @param content   text of the line
     */
    public Line_File_Impl(int contentId, int speechId, String content) {
        this.content = content;
        this.speechId = speechId;
        this.contentId = contentId;
    }

    /**
     * @return always {@link MongoDBEntryType#SPEECH_LINE}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.SPEECH_LINE;
    }
}

View file

@ -0,0 +1,94 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.LegislativePeriod;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.MemberOfParliament;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * {@link MemberOfParliament} implementation on top of {@link Individual_File_Impl}.
 * <p>
 * Besides the personal attributes of the base class it stores the member id, the
 * (mutable) party, the legislative periods, the ids of the first and last
 * legislative period and a photo string. Getters are generated by Lombok's
 * {@code @Getter}.
 */
@Getter
public class MemberOfParliament_File_Impl extends Individual_File_Impl implements MemberOfParliament {

    /** Legislative period id that {@link #isCurrentMember()} treats as the current one. */
    private static final int CURRENT_LEGISLATIVE_PERIOD_ID = 20;

    private final int id;
    private String party;
    private final List<LegislativePeriod> legislativePeriods;
    private final int firstLegislativePeriodId;
    private final int lastLegislativePeriodId;
    private final String photo;

    /**
     * @param name                     name, forwarded to the base class
     * @param firstName                first name, forwarded to the base class
     * @param title                    title, forwarded to the base class
     * @param dateOfBirth              date of birth, forwarded to the base class
     * @param dateofDeath              date of death, forwarded to the base class
     * @param placeOfBirth             place of birth, forwarded to the base class
     * @param gender                   gender, forwarded to the base class
     * @param religion                 religion, forwarded to the base class
     * @param id                       id of the member
     * @param party                    party of the member (changeable via {@link #updateParty(String)})
     * @param legislativePeriods       legislative periods of the member; may be {@code null} or empty,
     *                                 {@link #toHTML()} handles both
     * @param firstLegislativePeriodId id of the first legislative period
     * @param lastLegislativePeriodId  id of the last legislative period
     * @param photo                    photo string of the member
     */
    public MemberOfParliament_File_Impl(String name, String firstName, String title, String dateOfBirth, String dateofDeath, String placeOfBirth, String gender, String religion, int id, String party, List<LegislativePeriod> legislativePeriods, int firstLegislativePeriodId, int lastLegislativePeriodId, String photo) {
        super(name, firstName, title, dateOfBirth, dateofDeath, placeOfBirth, gender, religion);
        this.id = id;
        this.party = party;
        this.legislativePeriods = legislativePeriods;
        this.firstLegislativePeriodId = firstLegislativePeriodId;
        this.lastLegislativePeriodId = lastLegislativePeriodId;
        this.photo = photo;
    }

    /**
     * @return {@code true} if the last legislative period id equals
     *         {@link #CURRENT_LEGISLATIVE_PERIOD_ID}
     */
    @Override
    public boolean isCurrentMember() {
        return this.lastLegislativePeriodId == CURRENT_LEGISLATIVE_PERIOD_ID;
    }

    /**
     * Replaces the party of the member.
     *
     * @param party the new party
     */
    @Override
    public void updateParty(String party) {
        this.party = party;
    }

    /**
     * @return always {@link MongoDBEntryType#MEMBER_OF_PARLIAMENT}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.MEMBER_OF_PARLIAMENT;
    }

    /**
     * Renders the member as an HTML fragment: id, name, party, a photo reference
     * and a summary of the legislative periods.
     * <p>
     * If no legislative period objects are available, the summary falls back to
     * the first/last legislative period ids. Values are inserted as-is, without
     * HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        StringBuilder html = new StringBuilder();

        // Basic member details
        html.append("<div class='member-of-parliament'>")
                .append("<h2>Member of Parliament Details</h2>")
                .append("<p><strong>ID:</strong> ").append(id).append("</p>")
                .append("<p><strong>Name:</strong> ").append(getName()).append(" ").append(getFirstName()).append("</p>")
                .append("<p><strong>Party:</strong> ").append(party).append("</p>");

        // The image path is derived from name and first name; the 'photo' field is not used here.
        String photoPath = "../src/memberphotos/" + getName() + "_" + getFirstName() + ".jpg";
        html.append("<p><img src='").append(photoPath).append("' alt='Photo' width='100' /></p>");

        html.append("<h3>Legislative Periods</h3>");
        if (legislativePeriods == null || legislativePeriods.isEmpty()) {
            // Fallback: only the ids of the first and last period are known.
            if (firstLegislativePeriodId == lastLegislativePeriodId) {
                html.append("<p>Member of Parliament during the legislative period ")
                        .append(firstLegislativePeriodId);
            } else {
                html.append("<p>Candidated for the first time during the legislative period ")
                        .append(firstLegislativePeriodId)
                        .append(" and the last time during the legislative period ")
                        .append(lastLegislativePeriodId);
            }
            if (isCurrentMember()) {
                // Emitted inline: a <p> must not be nested inside the still-open <p>,
                // otherwise the closing tag below would be left dangling.
                html.append(" <b>(currently active)</b>");
            }
            html.append("</p>");
        } else {
            // First entry of the list
            LegislativePeriod firstPeriod = legislativePeriods.get(0);
            html.append("<p><strong>First Period:</strong> ").append(firstPeriod.getStartDate())
                    .append(" to ").append(firstPeriod.getEndDate()).append("</p>");

            // Last entry of the list
            LegislativePeriod lastPeriod = legislativePeriods.get(legislativePeriods.size() - 1);
            html.append("<p><strong>Last Period:</strong> ").append(lastPeriod.getStartDate())
                    .append(" to ").append(lastPeriod.getEndDate()).append("</p>");
        }

        html.append("</div>");
        return html.toString();
    }
}

View file

@ -0,0 +1,47 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.AgendaItem;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Session;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link Session} implementation with an id, the legislative period, start and
 * end time strings and the agenda items of the session.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Session_File_Impl implements Session {

    private final int id;
    private final String legislativePeriod;
    private final String dateTime;
    private final String endTime;
    /** Starts out empty; filled through {@link #addAgendaItem(AgendaItem)}. */
    private final List<AgendaItem> agendaItems = new ArrayList<>();

    /**
     * Creates a session without agenda items. Note that the legislative period
     * is the FIRST argument, before the id.
     *
     * @param legislativePeriod legislative period of the session
     * @param id                id of the session
     * @param dateTime          start date/time string of the session
     * @param endTime           end time string of the session
     */
    public Session_File_Impl(String legislativePeriod, int id, String dateTime, String endTime) {
        this.legislativePeriod = legislativePeriod;
        this.id = id;
        this.endTime = endTime;
        this.dateTime = dateTime;
    }

    /**
     * Appends an agenda item to this session.
     *
     * @param item the agenda item to append
     */
    @Override
    public void addAgendaItem(AgendaItem item) {
        agendaItems.add(item);
    }

    /**
     * @return always {@link MongoDBEntryType#SESSION}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.SESSION;
    }

    /**
     * Renders id, legislative period, start and end time as a small HTML
     * fragment (agenda items are not listed). Values are inserted as-is,
     * without HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        StringBuilder markup = new StringBuilder("<div class='session'>");
        markup.append("<h2>Session Details</h2>");
        markup.append("<p><strong>ID:</strong> ").append(id).append("</p>");
        markup.append("<p><strong>Legislative Period:</strong> ").append(legislativePeriod).append("</p>");
        markup.append("<p><strong>Start Time:</strong> ").append(dateTime).append("</p>");
        markup.append("<p><strong>End Time:</strong> ").append(endTime).append("</p>");
        markup.append("</div>");
        return markup.toString();
    }
}

View file

@ -0,0 +1,29 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Content;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speaker;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * Speech content element identifying a speaker by id, name and fraction.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Speaker_File_Impl implements Content, Speaker {

    private final int contentId;
    private final int speechId;
    private final int speakerId;
    private final String speakerName;
    private final String fraction;

    /**
     * @param contentId   id of this content element
     * @param speechId    id of the speech this element belongs to
     * @param speakerId   id of the speaker
     * @param speakerName name of the speaker
     * @param fraction    fraction of the speaker
     */
    public Speaker_File_Impl(int contentId, int speechId, int speakerId, String speakerName, String fraction) {
        // Position within the speech
        this.speechId = speechId;
        this.contentId = contentId;
        // Speaker details
        this.fraction = fraction;
        this.speakerName = speakerName;
        this.speakerId = speakerId;
    }

    /**
     * @return always {@link MongoDBEntryType#SPEECH_SPEAKER}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.SPEECH_SPEAKER;
    }
}

View file

@ -0,0 +1,54 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls;
import lombok.Getter;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Content;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link Speech} implementation referencing its session, agenda item and
 * speaker by id and collecting the content elements of the speech.
 * Getters are generated by Lombok's {@code @Getter}.
 */
@Getter
public class Speech_File_Impl implements Speech {

    private final int sessionId;
    private final int agendaItemId;
    private final int speakerId;
    private final int speechId;
    private final String speakerName;
    private final String fraction;
    private final List<Content> speechContents;

    /**
     * Creates a speech without content. Note the argument order:
     * {@code speechId} comes before {@code speakerId}.
     *
     * @param sessionId    id of the session the speech was held in
     * @param agendaItemId id of the agenda item the speech belongs to
     * @param speechId     id of the speech
     * @param speakerId    id of the speaker
     * @param speakerName  name of the speaker
     * @param fraction     fraction of the speaker
     */
    public Speech_File_Impl(int sessionId, int agendaItemId, int speechId, int speakerId, String speakerName, String fraction) {
        this.speakerId = speakerId;
        this.agendaItemId = agendaItemId;
        this.speechId = speechId;
        this.speakerName = speakerName;
        this.fraction = fraction;
        this.sessionId = sessionId;
        this.speechContents = new ArrayList<>();
    }

    /**
     * @return always {@link MongoDBEntryType#SPEECH}
     */
    @Override
    public MongoDBEntryType getType() {
        return MongoDBEntryType.SPEECH;
    }

    /**
     * Appends a content element to this speech.
     *
     * @param content the content element to append
     */
    @Override
    public void addContent(Content content) {
        this.speechContents.add(content);
    }

    /**
     * Renders session id, speaker id, speaker name and fraction as a small HTML
     * fragment; the speech contents themselves are not rendered. Values are
     * inserted as-is, without HTML escaping.
     *
     * @return the HTML fragment
     */
    @Override
    public String toHTML() {
        //TODO: Implement a logic that highlights the lines that the speaker is saying
        return "<div class='speech'>"
                + "<h2>Speech Details</h2>"
                + "<p><strong>Session ID:</strong> " + sessionId + "</p>"
                + "<p><strong>Speaker ID:</strong> " + speakerId + "</p>"
                + "<p><strong>Speaker Name:</strong> " + speakerName + "</p>"
                + "<p><strong>Fraction:</strong> " + fraction + "</p>"
                + "</div>";
    }
}

View file

@ -0,0 +1,56 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.ArrayList;
/**
 * An agenda item of a session, holding a title and the speeches assigned to it.
 */
public interface AgendaItem {
/**
 * Returns the ID of the agenda item.
 *
 * @return The ID of the agenda item.
 */
int getId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the ID of the session this agenda item belongs to.
 *
 * @return The session ID.
 */
int getSessionId();
/**
 * Returns the title of the agenda item.
 *
 * @return The title of the agenda item.
 */
String getTitle();
/**
 * Returns the speeches of the agenda item.
 *
 * @return The speeches of the agenda item.
 */
ArrayList<Speech> getSpeeches();
/**
 * Adds a speech to the agenda item.
 *
 * @param speech The speech to add.
 */
void addSpeech(Speech speech);
/**
 * Returns the HTML representation of the agenda item.
 *
 * @return The HTML representation of the agenda item.
 */
String toHTML();
}

View file

@ -0,0 +1,35 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A {@link Content} element that represents a comment and names its commentator.
 */
public interface Comment extends Content {
/**
 * Returns the content ID (redeclared from {@link Content}).
 *
 * @return The content ID.
 */
int getContentId();
/**
 * Returns the type of the MongoDB entry (redeclared from {@link Content}).
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the text of the comment.
 *
 * @return the comment
 */
String getComment();
/**
 * Returns the name of the commentator.
 *
 * @return the commentator name
 */
String getCommentatorName();
}

View file

@ -0,0 +1,38 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A constituency, described by an id, a name and a federal state.
 */
public interface Constituency {
/**
 * Returns the id of the constituency.
 *
 * @return the id of the constituency
 */
int getId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the name of the constituency.
 *
 * @return the name of the constituency
 */
String getName();
/**
 * Returns the federal state of the constituency.
 *
 * @return the federal state of the constituency
 */
String getFederalState();
/**
 * Returns the HTML representation of the constituency.
 *
 * @return The HTML representation of the constituency.
 */
String toHTML();
}

View file

@ -0,0 +1,27 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * Common base of the elements a speech consists of. {@link Comment},
 * {@link Line} and {@link Speaker} extend this interface.
 */
public interface Content {
/**
 * Returns the content ID.
 *
 * @return The content ID.
 */
int getContentId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the ID of the speech this content element belongs to.
 *
 * @return The speech ID.
 */
int getSpeechId();
}

View file

@ -0,0 +1,55 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * A parliamentary fraction with a changeable name, a long name and a list of members.
 */
public interface Fraction {
/**
 * Returns the name of the fraction.
 *
 * @return The name of the fraction.
 */
String getName();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the long name of the fraction.
 *
 * @return The long name of the fraction.
 */
String getLongName();
/**
 * Returns the members of the fraction as integers
 * (presumably member ids — confirm against the code that fills the list).
 *
 * @return The members of the fraction.
 */
List<Integer> getMembers();
/**
 * Adds a member to the fraction.
 *
 * @param member The member to add.
 */
void addMember(int member);
/**
 * Updates the name of the fraction.
 *
 * @param name The new name of the fraction.
 */
void updateName(String name);
/**
 * Returns the HTML representation of the fraction.
 *
 * @return The HTML representation of the fraction.
 */
String toHTML();
}

View file

@ -0,0 +1,26 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A history entry consisting of an action, the affected content and a date.
 */
public interface HistoryEntry {
/**
 * Returns the action of the history entry.
 *
 * @return The action of the history entry.
 */
String getAction();
/**
 * Returns the content of the history entry. The return type is deliberately
 * broad ({@code Object}); implementations may narrow it.
 *
 * @return The content of the history entry.
 */
Object getContent();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the date of the history entry.
 *
 * @return The date of the history entry, as a string.
 */
String getDate();
}

View file

@ -0,0 +1,75 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A person with basic personal attributes, all exposed as strings.
 */
public interface Individual {
/**
 * Returns the name
 *
 * @return the name
 */
String getName();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the first name
 *
 * @return the first name
 */
String getFirstName();
/**
 * Returns the title
 *
 * @return the title
 */
String getTitle();
/**
 * Returns the date of birth
 *
 * @return the date of birth
 */
String getDateOfBirth();
/**
 * Returns the date of death
 *
 * @return the date of death
 */
String getDateOfDeath();
/**
 * Returns the place of birth
 *
 * @return the place of birth
 */
String getPlaceOfBirth();
/**
 * Returns the gender
 *
 * @return the gender
 */
String getGender();
/**
 * Returns the religion
 *
 * @return the religion
 */
String getReligion();
/**
 * Returns the HTML representation of the individual.
 *
 * @return The HTML representation of the individual.
 */
String toHTML();
}

View file

@ -0,0 +1,47 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A legislative period with an id, start and end date and a constituency.
 */
public interface LegislativePeriod {
/**
 * Returns the id of the legislative period.
 *
 * @return the id of the legislative period
 */
int getId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the start date of the legislative period.
 *
 * @return the start date of the legislative period
 */
String getStartDate();
/**
 * Returns the end date of the legislative period.
 *
 * @return the end date of the legislative period
 */
String getEndDate();
/**
 * Returns the constituency associated with the legislative period.
 *
 * @return the constituency
 */
String getConstituency();
/**
 * Returns the HTML representation of the legislative period.
 *
 * @return The HTML representation of the legislative period.
 */
String toHTML();
}

View file

@ -0,0 +1,27 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A {@link Content} element that carries a piece of text of a speech.
 */
public interface Line extends Content {
/**
 * Returns the content ID (redeclared from {@link Content}).
 *
 * @return The content ID.
 */
int getContentId();
/**
 * Returns the type of the MongoDB entry (redeclared from {@link Content}).
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the content of the line.
 *
 * @return the content of the line
 */
String getContent();
}

View file

@ -0,0 +1,77 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * An {@link Individual} that is (or was) a member of parliament, with party,
 * legislative periods and a photo.
 */
public interface MemberOfParliament extends Individual {
/**
 * Returns the id of the member of parliament.
 *
 * @return the id of the member of parliament
 */
int getId();
/**
 * Returns the type of the MongoDB entry (redeclared from {@link Individual}).
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the party of the member of parliament.
 *
 * @return the party of the member of parliament
 */
String getParty();
/**
 * Returns the legislative periods of the member of parliament.
 *
 * @return the legislative periods of the member of parliament
 */
List<LegislativePeriod> getLegislativePeriods();
/**
 * Returns the first legislative period id of the member of parliament.
 *
 * @return the first legislative period id of the member of parliament
 */
int getFirstLegislativePeriodId();
/**
 * Returns the last legislative period id of the member of parliament.
 *
 * @return the last legislative period id of the member of parliament
 */
int getLastLegislativePeriodId();
/**
 * Returns the photo of the member of parliament as a string.
 * NOTE(review): previously documented as a base64 encoded string; the encoding
 * cannot be verified from this interface — confirm against the producer.
 *
 * @return the photo string of the member of parliament
 */
String getPhoto();
/**
 * Returns whether the member of parliament is a current member.
 *
 * @return whether the member of parliament is a current member
 */
boolean isCurrentMember();
/**
 * Updates the party of the member of parliament.
 *
 * @param party the new party of the member of parliament
 */
void updateParty(String party);
/**
 * Returns the HTML representation of the member of parliament
 * (redeclared from {@link Individual}).
 *
 * @return The HTML representation of the member of parliament.
 */
String toHTML();
}

View file

@ -0,0 +1,62 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * A plenary session with its legislative period, start/end time and agenda items.
 */
public interface Session {
/**
 * Returns the id of the session.
 *
 * @return the id of the session
 */
int getId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the legislative period of the session.
 *
 * @return the legislative period of the session
 */
String getLegislativePeriod();
/**
 * Returns the date and time of the session.
 *
 * @return the date and time of the session
 */
String getDateTime();
/**
 * Returns the end time of the session.
 *
 * @return the end time of the session
 */
String getEndTime();
/**
 * Returns the agenda items of the session.
 *
 * @return the agenda items of the session
 */
List<AgendaItem> getAgendaItems();
/**
 * Adds an agenda item to the session.
 *
 * @param item The agenda item to add.
 */
void addAgendaItem(AgendaItem item);
/**
 * Returns the HTML representation of the session.
 *
 * @return The HTML representation of the session.
 */
String toHTML();
}

View file

@ -0,0 +1,41 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
/**
 * A {@link Content} element that identifies a speaker by id, name and fraction.
 */
public interface Speaker extends Content {
/**
 * Returns the content ID (redeclared from {@link Content}).
 *
 * @return The content ID.
 */
int getContentId();
/**
 * Returns the type of the MongoDB entry (redeclared from {@link Content}).
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the id of the speaker.
 *
 * @return the speakerId
 */
int getSpeakerId();
/**
 * Returns the name of the speaker.
 *
 * @return the speakerName
 */
String getSpeakerName();
/**
 * Returns the fraction of the speaker.
 *
 * @return the fraction
 */
String getFraction();
}

View file

@ -0,0 +1,75 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums.MongoDBEntryType;
import java.util.List;
/**
 * A speech, linked to its session, agenda item and speaker by id, and made up
 * of a list of {@link Content} elements.
 */
public interface Speech {
/**
 * Returns the speech ID.
 *
 * @return The speech ID.
 */
int getSpeechId();
/**
 * Returns the type of the MongoDB entry.
 *
 * @return The type of the MongoDB entry.
 */
MongoDBEntryType getType();
/**
 * Returns the agenda item ID.
 *
 * @return The agenda item ID.
 */
int getAgendaItemId();
/**
 * Returns the speaker ID.
 *
 * @return The speaker ID.
 */
int getSpeakerId();
/**
 * Returns the speaker name.
 *
 * @return The speaker name.
 */
String getSpeakerName();
/**
 * Returns the fraction.
 *
 * @return The fraction.
 */
String getFraction();
/**
 * Returns the session ID.
 *
 * @return The session ID.
 */
int getSessionId();
/**
 * Returns the speech contents.
 *
 * @return The speech contents.
 */
List<Content> getSpeechContents();
/**
 * Adds a content element to the speech. Despite the parameter name, any
 * {@link Content} is accepted, not only speakers.
 *
 * @param speaker The content element to add.
 */
void addContent(Content speaker);
/**
 * Returns the HTML representation of the speech.
 *
 * @return The HTML representation of the speech.
 */
String toHTML();
}

View file

@ -0,0 +1,188 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches;
import lombok.Getter;
import lombok.Setter;
import org.texttechnologylab.project.gruppe_05_1.util.PPRUtils;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Impls.*;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.AgendaItem;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Session;
import org.texttechnologylab.project.gruppe_05_1.xml.speeches.Interfaces.Speech;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
public class SpeechParser {
// Speeches collected by the parser; replaced by a fresh list at the start of
// every parseAllSessions() call. Exposed through a Lombok-generated getter.
@Getter
private List<Speech> speeches;
// Agenda items collected by the parser; replaced by a fresh list at the start of
// every parseAllSessions() call and appended to while a session file is parsed.
@Getter
private List<AgendaItem> agendaItems;
// Flag with a Lombok-generated setter only. It is a boxed Boolean, so it stays
// null until set. NOTE(review): where it is read is not visible in this part of the file.
@Setter
private Boolean parseLegislativePeriods;
/**
 * Parses every XML protocol delivered by {@link PPRUtils#processXML()} into a
 * {@link Session}.
 * <p>
 * The collected speeches and agenda items are reset at the start of each call.
 * Every document is written to a temporary file, parsed from there, and the
 * temporary file is removed again — also when parsing fails. A document that
 * cannot be processed is reported on {@code System.err} and skipped; the
 * remaining documents are still parsed.
 *
 * @return the successfully parsed sessions (possibly empty, never {@code null})
 */
public List<Session> parseAllSessions() {
    List<Session> sessions = new ArrayList<>();
    this.speeches = new ArrayList<>();
    this.agendaItems = new ArrayList<>();
    //TODO Change the logic so that it reads from the collection of XML documents directly instead of from a file path
    Set<Document> xmlDocuments = PPRUtils.processXML();
    for (Document xmlDoc : xmlDocuments) {
        File tempFile = null;
        try {
            tempFile = convertDocumentToFile(xmlDoc);
            sessions.add(parseSessionFile(tempFile));
        } catch (Exception e) {
            System.err.println("Error parsing XML document.");
            e.printStackTrace();
        } finally {
            // Clean up in finally so a failed parse does not leave the temporary file behind.
            if (tempFile != null && tempFile.exists() && !tempFile.delete()) {
                System.err.println("Could not delete temporary file: " + tempFile.getAbsolutePath());
            }
        }
    }
    return sessions;
}
private Session parseSessionFile(File file) throws Exception {
//file = removeDoctypeAnnotation(file.getAbsolutePath());
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document document = builder.parse(file);
// Extract session details
Element root = document.getDocumentElement();
String legislativePeriod = root.getAttribute("wahlperiode");
int sessionId = Integer.parseInt(root.getAttribute("sitzung-nr"));
System.out.println("Session " + sessionId + " wurde gespeichert");
String sessionDate = root.getAttribute("sitzung-datum");
Element startTimeElement = (Element) root.getElementsByTagName("sitzungsbeginn").item(0);
String startTimeString = startTimeElement != null ? startTimeElement.getAttribute("sitzung-start-uhrzeit") : null;
String startTime = startTimeString != null ? sessionDate + " " + startTimeString : sessionDate;
Element sessionEndElement = (Element) root.getElementsByTagName("sitzungsende").item(0);
String sessionEndTime = sessionEndElement != null ? sessionEndElement.getAttribute("sitzung-ende-uhrzeit") : null;
Session_File_Impl session = new Session_File_Impl(legislativePeriod, sessionId, startTime, sessionEndTime);
// Parse agenda items
NodeList agendaNodes = document.getElementsByTagName("tagesordnungspunkt");
for (int agendaItemId = 0; agendaItemId < agendaNodes.getLength(); agendaItemId++) {
Element agendaElement = (Element) agendaNodes.item(agendaItemId);
if (agendaElement == null) continue;
String agendaTitle = agendaElement.getAttribute("top-id");
AgendaItem_File_Impl agendaItemFileImpl = new AgendaItem_File_Impl(agendaItemId, sessionId, agendaTitle);
this.agendaItems.add(agendaItemFileImpl);
// Parse speeches
NodeList speechNodes = agendaElement.getElementsByTagName("rede");
for (int speechId = 0; speechId < speechNodes.getLength(); speechId++) {
Element speechElement = (Element) speechNodes.item(speechId);
if (speechElement == null) continue;
// Parse speaker
Element speakerElement = (Element) speechElement.getElementsByTagName("redner").item(0);
if (speakerElement == null) continue;
int speakerId = Integer.parseInt(speakerElement.getAttribute("id"));
Element nameElement = (Element) speakerElement.getElementsByTagName("name").item(0);
if (nameElement == null) continue;
String title = getOptionalTextContent(nameElement, "titel");
String firstName = getOptionalTextContent(nameElement, "vorname");
String lastName = getOptionalTextContent(nameElement, "nachname");
String fraction = getOptionalTextContent(nameElement, "fraktion");
String speakerName = (title != null ? title + " " : "") + firstName + " " + lastName;
Speech_File_Impl speech = new Speech_File_Impl(sessionId, agendaItemId, speechId, speakerId, speakerName, fraction);
// Add the speaker to speech contents
speech.addContent(new Speaker_File_Impl(0, speechId, speakerId, speakerName, fraction));
// Parse content in order
NodeList contentNodes = speechElement.getChildNodes();
for (int k = 0; k < contentNodes.getLength(); k++) {
Node contentNode = contentNodes.item(k);
if (contentNode == null || contentNode.getNodeType() != Node.ELEMENT_NODE) continue;
Element contentElement = (Element) contentNode;
String tagName = contentElement.getTagName();
switch (tagName) {
case "p":
String paragraphClass = contentElement.getAttribute("klasse");
if ("redner".equals(paragraphClass)) {
// This case can be skipped as the speaker is already added
continue;
} else if ("kommentar".equals(paragraphClass)) {
String comment = contentElement.getTextContent().trim();
String commentatorName = ""; // Extract if present
speech.addContent(new Comment_File_Impl(k + 1, speechId, commentatorName, comment));
} else {
String line = contentElement.getTextContent().trim();
speech.addContent(new Line_File_Impl(k + 1, speechId, line));
}
break;
case "kommentar":
String comment = contentElement.getTextContent().trim();
String commentatorName = ""; // Extract if available
speech.addContent(new Comment_File_Impl(k + 1, speechId, commentatorName, comment));
break;
default:
break;
}
}
agendaItemFileImpl.addSpeech(speech);
this.speeches.add(speech);
}
session.addAgendaItem(agendaItemFileImpl);
}
return session;
}
private static String getOptionalTextContent(Element parent, String tagName) {
NodeList nodes = parent.getElementsByTagName(tagName);
if (nodes.getLength() > 0) {
Node node = nodes.item(0);
if (node != null) {
return node.getTextContent().trim();
}
}
return null;
}
/**
* Konvertiert ein org.w3c.dom.Document in eine temporäre Datei.
*/
private File convertDocumentToFile(org.w3c.dom.Document xmlDoc) throws Exception {
File tempFile = File.createTempFile("session_", ".xml");
TransformerFactory transformerFactory = TransformerFactory.newInstance();
Transformer transformer = transformerFactory.newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
DOMSource source = new DOMSource(xmlDoc);
StreamResult result = new StreamResult(tempFile);
transformer.transform(source, result);
return tempFile;
}
}

View file

@ -0,0 +1,17 @@
package org.texttechnologylab.project.gruppe_05_1.xml.speeches.enums;
/**
 * Enumerates the entry types known to the speeches package.
 *
 * <p>NOTE(review): judging by the name, these presumably tag the kind of record stored in
 * MongoDB; confirm against the call sites. The declaration order is kept unchanged because
 * {@code ordinal()} and {@code values()} depend on it.
 */
public enum MongoDBEntryType {
    // Session structure
    AGENDA_ITEM,

    // NOTE(review): spelled "CONSTIUENCY" (presumably meant as CONSTITUENCY); the name is
    // kept as-is because renaming a public constant would break existing references.
    CONSTIUENCY,

    FRACTION,
    HISTORY_ENTRY,
    INDIVIDUAL,
    LEGISLATIVE_PERIOD,
    MEMBER_OF_PARLIAMENT,
    SESSION,

    // Speech and its content parts
    SPEECH,
    SPEECH_COMMENT,
    SPEECH_CONTENT,
    SPEECH_LINE,
    SPEECH_SPEAKER
}