Implemented an XmiExtractor that extracts already-analyzed NLP data (tokens, sentences, dependencies, named entities, sentiment) from the given XMI files and uploads it to MongoDB.
This commit is contained in:
parent
7113637f00
commit
37c43e63b8
1 changed file with 167 additions and 0 deletions
@@ -0,0 +1,167 @@
package org.texttechnologylab.project.gruppe_05_1.nlp;

import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import com.mongodb.client.result.UpdateResult;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.bson.Document;
import org.bson.conversions.Bson;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.*;

public class XmiExtractor {

    private final MongoDatabase database;

    public XmiExtractor(MongoDatabase database) {
        this.database = database;
    }

    /**
     * Reads the speech archive and processes every gzipped XMI entry it contains.
     * NOTE: the zip path is hard-coded to a local development machine.
     */
    public void extractAndUploadXmiData() throws IOException {
        try (ZipInputStream zis = new ZipInputStream(new FileInputStream("C:/Users/Leon/Desktop/Uni/PPR/multimodal_parliament_explorer_05_1/src/main/resources/speeches/20.zip"))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.getName().endsWith(".xmi.gz")) {
                    // Buffer the whole entry so it can be re-read as a fresh stream.
                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = zis.read(buffer)) > 0) {
                        baos.write(buffer, 0, len);
                    }
                    byte[] entryData = baos.toByteArray();
                    processXmiGzStream(new ByteArrayInputStream(entryData), entry.getName());
                }
                zis.closeEntry();
            }
        }
    }
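
    // --- Sketch (not part of the original commit): a path-parameterized variant,
    // so callers are not tied to the hard-coded development path above.
    // Assumes Java 9+ for InputStream.readAllBytes().
    public void extractAndUploadXmiData(String zipPath) throws IOException {
        try (ZipInputStream zis = new ZipInputStream(new FileInputStream(zipPath))) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                if (entry.getName().endsWith(".xmi.gz")) {
                    // readAllBytes() stops at the end of the current zip entry.
                    processXmiGzStream(new ByteArrayInputStream(zis.readAllBytes()), entry.getName());
                }
                zis.closeEntry();
            }
        }
    }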

    /**
     * Deserializes one gzipped XMI stream into a JCas and uploads the extracted
     * annotations as a structured "analysisResults" document.
     */
    private void processXmiGzStream(InputStream inputStream, String filename) {
        try (GZIPInputStream gis = new GZIPInputStream(inputStream)) {
            JCas jCas = JCasFactory.createJCas();
            // 'true' = lenient: unknown types in the XMI are ignored instead of failing.
            XmiCasDeserializer.deserialize(gis, jCas.getCas(), true);

            // Build the structured analysisResults document.
            Document analysisResults = new Document();

            // Tokens, including POS and lemma (assumes both annotations are present).
            List<Document> tokens = new ArrayList<>();
            for (Token token : JCasUtil.select(jCas, Token.class)) {
                Document tokenDoc = new Document()
                        .append("text", token.getCoveredText())
                        .append("pos", token.getPos().getPosValue())
                        .append("lemma", token.getLemma().getValue());
                tokens.add(tokenDoc);
            }
            analysisResults.append("tokens", tokens);

            // Sentences
            List<String> sentences = JCasUtil.select(jCas, Sentence.class).stream()
                    .map(Sentence::getCoveredText)
                    .collect(Collectors.toList());
            analysisResults.append("sentences", sentences);

            // Dependencies
            List<Document> dependencies = new ArrayList<>();
            for (Dependency dep : JCasUtil.select(jCas, Dependency.class)) {
                Document depDoc = new Document()
                        .append("type", dep.getDependencyType())
                        .append("governor", dep.getGovernor().getCoveredText())
                        .append("dependent", dep.getDependent().getCoveredText());
                dependencies.add(depDoc);
            }
            analysisResults.append("dependencies", dependencies);

            // Named entities
            List<Document> namedEntities = new ArrayList<>();
            for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
                Document neDoc = new Document()
                        .append("text", ne.getCoveredText())
                        .append("type", ne.getValue());
                namedEntities.add(neDoc);
            }
            analysisResults.append("namedEntities", namedEntities);

            // Sentiment
            List<Document> sentiments = new ArrayList<>();
            for (org.hucompute.textimager.uima.type.Sentiment sentiment :
                    JCasUtil.select(jCas, org.hucompute.textimager.uima.type.Sentiment.class)) {

                Document sentimentDoc = new Document()
                        .append("begin", sentiment.getBegin())
                        .append("end", sentiment.getEnd())
                        .append("score", sentiment.getSentiment())
                        .append("subjectivity", sentiment.getSubjectivity());

                // VaderSentiment additionally carries pos/neu/neg scores.
                if (sentiment instanceof org.hucompute.textimager.uima.type.VaderSentiment) {
                    org.hucompute.textimager.uima.type.VaderSentiment vader =
                            (org.hucompute.textimager.uima.type.VaderSentiment) sentiment;
                    sentimentDoc
                            .append("pos", vader.getPos())
                            .append("neu", vader.getNeu())
                            .append("neg", vader.getNeg());
                }
                sentiments.add(sentimentDoc);
            }
            analysisResults.append("sentiments", sentiments);

            // TODO: implement uploading of topics.

            // Upload the structured document to MongoDB.
            String speechKey = extractSpeechKeyFromFilename(filename);
            if (speechKey != null) {
                uploadToMongoDB(speechKey, analysisResults);
            }

        } catch (Exception e) {
            System.err.println("Failed to process " + filename);
            e.printStackTrace();
        }
    }

    // Derives the speech key from a zip entry name by stripping the "20/" folder
    // prefix and the ".xmi.gz" suffix.
    private static String extractSpeechKeyFromFilename(String filename) {
        String baseName = filename.replace(".xmi.gz", "");
        return baseName.replace("20/", "");
    }
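    // Illustration only (the entry name below is hypothetical):
    //   extractSpeechKeyFromFilename("20/ID2000100.xmi.gz") returns "ID2000100"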

    // Attaches the analysisResults document to the matching speech via $set.
    private void uploadToMongoDB(String speechKey, Document analysisResults) {
        MongoCollection<Document> collection = database.getCollection("speech");
        Bson filter = Filters.eq("speechKey", speechKey);
        Bson update = new Document("$set", new Document("analysisResults", analysisResults));

        UpdateResult result = collection.updateOne(filter, update);
        if (result.getModifiedCount() > 0) {
            System.out.println("Structured analysisResults updated for speechKey: " + speechKey);
        } else {
            // modifiedCount == 0 means either no matching speech exists or the
            // stored analysisResults are already identical.
            System.out.println("No update performed for speechKey: " + speechKey);
        }
    }
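    // Sketch (not in the original commit): if speeches might be missing from the
    // collection, updateOne can upsert instead of silently matching nothing:
    //   collection.updateOne(filter, update,
    //           new com.mongodb.client.model.UpdateOptions().upsert(true));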

    public MongoDatabase getDatabase() {
        return this.database;
    }

    /* Example driver, kept for reference:
    public static void main(String[] args) {
        try {
            MongoDBHandler mongoDBHandler = new MongoDBHandler();
            MongoDatabase database = mongoDBHandler.getDatabase();
            XmiExtractor extractor = new XmiExtractor(database);
            extractor.extractAndUploadXmiData();
            System.out.println("Processing complete.");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    */
}
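
For reference, a minimal sketch (not part of this commit) of reading the uploaded results back with the MongoDB sync Java driver. The connection string, database name, and "someSpeechKey" are placeholders; only the "speech" collection and the field names are taken from the code above, and getList assumes a reasonably recent driver (3.10+).

import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.model.Filters;
import org.bson.Document;

public class AnalysisReadBackExample {
    public static void main(String[] args) {
        // Connection string and database name are placeholders.
        MongoDatabase db = MongoClients.create("mongodb://localhost:27017")
                .getDatabase("parliament_explorer");
        MongoCollection<Document> speeches = db.getCollection("speech");

        // "someSpeechKey" stands in for a key produced by extractSpeechKeyFromFilename.
        Document speech = speeches.find(Filters.eq("speechKey", "someSpeechKey")).first();
        if (speech != null && speech.containsKey("analysisResults")) {
            Document analysis = speech.get("analysisResults", Document.class);
            System.out.println("tokens: " + analysis.getList("tokens", Document.class).size());
            System.out.println("sentences: " + analysis.getList("sentences", String.class).size());
            System.out.println("namedEntities: " + analysis.getList("namedEntities", Document.class).size());
        }
    }
}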