Added scraping of Plenarprotokoll
This commit is contained in:
parent
3902b00956
commit
cccd09bc8f
1 changed files with 86 additions and 0 deletions
|
@ -3,6 +3,9 @@ package org.texttechnologylab.project.gruppe_05_1.util;
|
|||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import com.mongodb.client.model.Indexes;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoDBHandler;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoObjectFactory;
|
||||
import org.texttechnologylab.project.gruppe_05_1.database.MongoPprUtils;
|
||||
|
@ -17,8 +20,10 @@ import org.w3c.dom.Element;
|
|||
import org.w3c.dom.Node;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -28,6 +33,8 @@ import java.util.zip.ZipInputStream;
|
|||
public abstract class PPRUtils {
|
||||
|
||||
public static final String PARTEILOS_KUERZEL = "Parteilos";
|
||||
private static Set<String> processedProtocols = new HashSet<>();
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
@ -354,4 +361,83 @@ public abstract class PPRUtils {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
public static void processXML() {
|
||||
int offset = 0;
|
||||
int limit = 10;
|
||||
boolean hasMore = true;
|
||||
|
||||
while (hasMore) {
|
||||
String queryUrl = "https://www.bundestag.de/ajax/filterlist/de/services/opendata/866354-866354?limit="
|
||||
+ limit + "&noFilterSet=true&offset=" + offset;
|
||||
System.out.println("Lade: " + queryUrl);
|
||||
try {
|
||||
Document htmlDoc = Jsoup.connect(queryUrl).get();
|
||||
Elements xmlLinks = htmlDoc.select("a.bt-link-dokument");
|
||||
if (xmlLinks.isEmpty()) {
|
||||
System.out.println("Keine weiteren Protokolle gefunden.");
|
||||
break;
|
||||
}
|
||||
|
||||
for (org.jsoup.nodes.Element link : xmlLinks) {
|
||||
String xmlUrl = link.attr("href");
|
||||
System.out.println("Verarbeite XML: " + xmlUrl);
|
||||
try {
|
||||
org.w3c.dom.Document xmlDoc = downloadAndParseXML(xmlUrl);
|
||||
String uniqueId = xmlDoc.getDocumentElement().getAttribute("sitzung-nr");
|
||||
if (processedProtocols.contains(uniqueId)) {
|
||||
System.out.println("Protokoll bereits verarbeitet: " + uniqueId);
|
||||
continue;
|
||||
}
|
||||
|
||||
processedProtocols.add(uniqueId);
|
||||
//TODO verarbeitung
|
||||
} catch (Exception e) {
|
||||
System.err.println("Fehler beim Verarbeiten der XML-Datei: " + xmlUrl);
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
// check if next
|
||||
org.jsoup.nodes.Element metaSlider = htmlDoc.selectFirst("div.meta-slider");
|
||||
if (metaSlider != null && metaSlider.hasAttr("data-nextoffset")) {
|
||||
int nextOffset = Integer.parseInt(metaSlider.attr("data-nextoffset"));
|
||||
if (nextOffset <= offset) {
|
||||
hasMore = false;
|
||||
} else {
|
||||
offset = nextOffset;
|
||||
}
|
||||
} else {
|
||||
hasMore = false;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
System.err.println("Fehler beim Laden der Seite: " + queryUrl);
|
||||
e.printStackTrace();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Lädt die XML-Datei von der gegebenen URL herunter und parst sie
|
||||
* mittels dbParser.
|
||||
*
|
||||
* @param xmlUrl URL der XML-Datei
|
||||
* @return Das geparste org.w3c.dom.Document
|
||||
* @throws Exception wenn ein Fehler auftritt
|
||||
*/
|
||||
public static org.w3c.dom.Document downloadAndParseXML(String xmlUrl) throws Exception {
|
||||
URL url = new URL(xmlUrl);
|
||||
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
|
||||
connection.setRequestMethod("GET");
|
||||
connection.connect();
|
||||
|
||||
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
|
||||
org.w3c.dom.Document doc = dBuilder.parse(connection.getInputStream());
|
||||
doc.getDocumentElement().normalize();
|
||||
return doc;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue