From cccd09bc8facb0c2dc13feda95ceadc6817de0c4 Mon Sep 17 00:00:00 2001 From: Picman2000 <65342372+Picman2000@users.noreply.github.com> Date: Mon, 3 Mar 2025 15:52:39 +0100 Subject: [PATCH] Added scraping of Plenarprotokoll --- .../project/gruppe_05_1/util/PPRUtils.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java b/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java index 902d2e1..30a75cf 100644 --- a/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java +++ b/src/main/java/org/texttechnologylab/project/gruppe_05_1/util/PPRUtils.java @@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.util; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import com.mongodb.client.model.Indexes; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler; import org.texttechnologylab.project.gruppe_05_1.database.MongoObjectFactory; import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils; @@ -17,8 +20,10 @@ import org.w3c.dom.Element; import org.w3c.dom.Node; import org.xml.sax.InputSource; +import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.*; +import java.net.HttpURLConnection; import java.net.URL; import java.util.*; import java.util.stream.Collectors; @@ -28,6 +33,8 @@ import java.util.zip.ZipInputStream; public abstract class PPRUtils { public static final String PARTEILOS_KUERZEL = "Parteilos"; + private static Set processedProtocols = new HashSet<>(); + /** @@ -354,4 +361,83 @@ public abstract class PPRUtils { } }); } + + + public static void processXML() { + int offset = 0; + int limit = 10; + boolean hasMore = true; + + while (hasMore) { + String queryUrl = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?limit=" + + limit + "&noFilterSet=true&offset=" + offset; + System.out.println("Lade: " + queryUrl); + try { + Document htmlDoc = Jsoup.connect(queryUrl).get(); + Elements xmlLinks = htmlDoc.select("a.bt-link-dokument"); + if (xmlLinks.isEmpty()) { + System.out.println("Keine weiteren Protokolle gefunden."); + break; + } + + for (org.jsoup.nodes.Element link : xmlLinks) { + String xmlUrl = link.attr("href"); + System.out.println("Verarbeite XML: " + xmlUrl); + try { + org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl); + String uniqueId = xmlDoc.getDocumentElement().getAttribute("sitzung-nr"); + if (processedProtocols.contains(uniqueId)) { + System.out.println("Protokoll bereits verarbeitet: " + uniqueId); + continue; + } + + processedProtocols.add(uniqueId); + //TODO verarbeitung + } catch (Exception e) { + System.err.println("Fehler beim Verarbeiten der XML-Datei: " + xmlUrl); + e.printStackTrace(); + } + } + + // check if next + org.jsoup.nodes.Element metaSlider = htmlDoc.selectFirst("div.meta-slider"); + if (metaSlider != null && metaSlider.hasAttr("data-nextoffset")) { + int nextOffset = Integer.parseInt(metaSlider.attr("data-nextoffset")); + if (nextOffset <= offset) { + hasMore = false; + } else { + offset = nextOffset; + } + } else { + hasMore = false; + } + } catch (IOException e) { + System.err.println("Fehler beim Laden der Seite: " + queryUrl); + e.printStackTrace(); + break; + } + } + } + + /** + * Lädt die XML-Datei von der gegebenen URL herunter und parst sie + * mittels dbParser. + * + * @param xmlUrl URL der XML-Datei + * @return Das geparste org.w3c.dom.Document + * @throws Exception wenn ein Fehler auftritt + */ + public static org.w3c.dom.Document downloadAndParseXML(String xmlUrl) throws Exception { + URL url = new URL(xmlUrl); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("GET"); + connection.connect(); + + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + org.w3c.dom.Document doc = dBuilder.parse(connection.getInputStream()); + doc.getDocumentElement().normalize(); + return doc; + } + }