Add scraping of Plenarprotokoll (Bundestag plenary protocol) XML documents

This commit is contained in:
Picman2000 2025-03-03 15:52:39 +01:00
parent 3902b00956
commit cccd09bc8f

View file

@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.util;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Indexes;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
import org.texttechnologylab.project.gruppe_05_1.database.MongoObjectFactory;
import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils;
@ -17,8 +20,10 @@ import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.stream.Collectors;
@ -28,6 +33,8 @@ import java.util.zip.ZipInputStream;
public abstract class PPRUtils {
// Canonical label used for speakers without a party affiliation.
public static final String PARTEILOS_KUERZEL = "Parteilos";
// Dedup set of protocol identifiers ("sitzung-nr" values) already handled by
// processXML(). In-memory only — presumably reset on every JVM restart; TODO confirm
// whether already-scraped protocols should be persisted instead.
private static Set<String> processedProtocols = new HashSet<>();
/**
@ -354,4 +361,83 @@ public abstract class PPRUtils {
}
});
}
/**
 * Scrapes the Bundestag open-data AJAX filter list for Plenarprotokoll XML
 * files, paging via the {@code offset} query parameter. Each linked XML
 * document is downloaded, parsed and deduplicated by its {@code sitzung-nr}
 * root attribute via {@link #processedProtocols}.
 * <p>
 * Errors for a single XML file are logged and skipped; an I/O error on the
 * listing page aborts the scrape.
 */
public static void processXML() {
    int offset = 0;
    int limit = 10;
    boolean hasMore = true;
    while (hasMore) {
        String queryUrl = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?limit="
                + limit + "&noFilterSet=true&offset=" + offset;
        System.out.println("Lade: " + queryUrl);
        try {
            Document htmlDoc = Jsoup.connect(queryUrl).get();
            Elements xmlLinks = htmlDoc.select("a.bt-link-dokument");
            if (xmlLinks.isEmpty()) {
                System.out.println("Keine weiteren Protokolle gefunden.");
                break;
            }
            for (org.jsoup.nodes.Element link : xmlLinks) {
                String xmlUrl = link.attr("href");
                System.out.println("Verarbeite XML: " + xmlUrl);
                try {
                    org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl);
                    String uniqueId = xmlDoc.getDocumentElement().getAttribute("sitzung-nr");
                    // A missing attribute yields "" — using that as the dedup key would
                    // silently skip every later protocol without a sitzung-nr. Fall back
                    // to the URL, which is unique per document.
                    if (uniqueId.isEmpty()) {
                        uniqueId = xmlUrl;
                    }
                    if (processedProtocols.contains(uniqueId)) {
                        System.out.println("Protokoll bereits verarbeitet: " + uniqueId);
                        continue;
                    }
                    processedProtocols.add(uniqueId);
                    //TODO verarbeitung
                } catch (Exception e) {
                    // Per-file failures must not abort the whole scrape.
                    System.err.println("Fehler beim Verarbeiten der XML-Datei: " + xmlUrl);
                    e.printStackTrace();
                }
            }
            // Pagination: the listing exposes the next offset on div.meta-slider.
            org.jsoup.nodes.Element metaSlider = htmlDoc.selectFirst("div.meta-slider");
            if (metaSlider != null && metaSlider.hasAttr("data-nextoffset")) {
                try {
                    int nextOffset = Integer.parseInt(metaSlider.attr("data-nextoffset"));
                    // A non-advancing offset would loop forever — stop instead.
                    if (nextOffset <= offset) {
                        hasMore = false;
                    } else {
                        offset = nextOffset;
                    }
                } catch (NumberFormatException nfe) {
                    // Previously unguarded: a malformed attribute escaped the
                    // IOException catch below and crashed the whole scrape.
                    System.err.println("Ungültiger data-nextoffset: "
                            + metaSlider.attr("data-nextoffset"));
                    hasMore = false;
                }
            } else {
                hasMore = false;
            }
        } catch (IOException e) {
            System.err.println("Fehler beim Laden der Seite: " + queryUrl);
            e.printStackTrace();
            break;
        }
    }
}
/**
* Lädt die XML-Datei von der gegebenen URL herunter und parst sie
* mittels dbParser.
*
* @param xmlUrl URL der XML-Datei
* @return Das geparste org.w3c.dom.Document
* @throws Exception wenn ein Fehler auftritt
*/
/**
 * Downloads the XML file from the given URL and parses it into a DOM tree.
 * <p>
 * The parser is hardened against XXE: external general/parameter entities and
 * external DTD loading are disabled (a DOCTYPE declaration itself stays allowed,
 * since Bundestag protocol XML may carry one). For HTTP(S) URLs the response
 * status is checked so error pages are not parsed as protocols; non-HTTP URLs
 * (e.g. {@code file:}) are supported as well. The input stream is always closed
 * and HTTP connections are disconnected.
 *
 * @param xmlUrl URL of the XML file
 * @return the parsed, normalized {@code org.w3c.dom.Document}
 * @throws Exception on connection, HTTP-status or parse errors
 */
public static org.w3c.dom.Document downloadAndParseXML(String xmlUrl) throws Exception {
    URL url = new URL(xmlUrl);
    java.net.URLConnection connection = url.openConnection();
    HttpURLConnection http =
            (connection instanceof HttpURLConnection) ? (HttpURLConnection) connection : null;
    if (http != null) {
        http.setRequestMethod("GET");
        http.connect();
        int status = http.getResponseCode();
        if (status != HttpURLConnection.HTTP_OK) {
            http.disconnect();
            throw new IOException("HTTP " + status + " beim Laden von " + xmlUrl);
        }
    }
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    // XXE hardening: never resolve external entities or fetch external DTDs
    // for XML coming off the network.
    dbFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    dbFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    dbFactory.setXIncludeAware(false);
    dbFactory.setExpandEntityReferences(false);
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    try (InputStream in = connection.getInputStream()) {
        org.w3c.dom.Document doc = dBuilder.parse(in);
        doc.getDocumentElement().normalize();
        return doc;
    } finally {
        // try-with-resources closes the stream; an HTTP connection also needs disconnect.
        if (http != null) {
            http.disconnect();
        }
    }
}
}