feat: add pdfbox and poi document parser

This commit is contained in:
michael 2024-01-24 12:21:05 +08:00
parent ccc164e5ca
commit 9fac085a72
8 changed files with 166 additions and 1 deletions

View File

@ -20,6 +20,13 @@ import com.agentsflex.util.Metadata;
public class Document extends Metadata {
private String content;
/**
 * Creates a document without assigning any content; the {@code content}
 * field keeps its default value until it is set elsewhere.
 */
public Document() {
}
/**
 * Creates a document that holds the given text.
 *
 * @param content the text to store as this document's content; stored as-is,
 *                no validation or copying is performed here
 */
public Document(String content) {
this.content = content;
}
/**
 * Returns the text currently stored in this document.
 *
 * @return the value of the {@code content} field, exactly as it was stored
 */
public String getContent() {
return content;
}

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for the PDFBox-based document parser. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the document-parser aggregator POM. -->
<parent>
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-document-parser</artifactId>
<version>1.0.0-alpha.1</version>
</parent>
<artifactId>agents-flex-document-parser-pdfbox</artifactId>
<!-- Java 8 source/target level and UTF-8 source encoding. -->
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Core module of this project; version is pinned explicitly to match the parent version above. -->
<dependency>
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-core</artifactId>
<version>1.0.0-alpha.1</version>
</dependency>
<!-- Apache PDFBox, the PDF library this module depends on. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.30</version>
</dependency>
<!-- Test-only dependency. No version is declared here: presumably managed by an ancestor POM's dependencyManagement (TODO confirm). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,22 @@
package com.agentsflex.document.parser;
import com.agentsflex.document.Document;
import com.agentsflex.document.Parser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link Parser} implementation that turns PDF input into a {@link Document}
 * by extracting its text with Apache PDFBox.
 */
public class PdfBoxParser implements Parser {

    /**
     * Loads a PDF from the given stream and returns its extracted text.
     *
     * @param stream the stream to read the PDF from
     * @return a {@link Document} whose content is the text returned by
     *         {@link PDFTextStripper#getText(PDDocument)}
     * @throws RuntimeException wrapping the underlying {@link IOException} when
     *         loading the PDF, extracting its text, or closing it fails
     */
    @Override
    public Document parse(InputStream stream) {
        final String extracted;
        // The loaded PDDocument is released by try-with-resources before the result is wrapped.
        try (PDDocument loaded = PDDocument.load(stream)) {
            extracted = new PDFTextStripper().getText(loaded);
        } catch (IOException cause) {
            // Surface I/O problems as unchecked, keeping the original exception as the cause.
            throw new RuntimeException(cause);
        }
        return new Document(extracted);
    }
}

View File

@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for the Apache POI-based document parser. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the document-parser aggregator POM. -->
<parent>
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-document-parser</artifactId>
<version>1.0.0-alpha.1</version>
</parent>
<artifactId>agents-flex-document-parser-poi</artifactId>
<properties>
<!-- Java 8 source/target level and UTF-8 source encoding. -->
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Single version shared by all POI artifacts below so they stay aligned. -->
<apache.poi.version>5.2.5</apache.poi.version>
</properties>
<dependencies>
<!-- Core module of this project; version is pinned explicitly to match the parent version above. -->
<dependency>
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-core</artifactId>
<version>1.0.0-alpha.1</version>
</dependency>
<!-- The three POI artifacts used by this module, all resolved at ${apache.poi.version}. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${apache.poi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${apache.poi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${apache.poi.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,21 @@
package com.agentsflex.document.parser;
import com.agentsflex.document.Document;
import com.agentsflex.document.Parser;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
import java.io.IOException;
import java.io.InputStream;
/**
 * {@link Parser} implementation that turns input into a {@link Document}
 * by extracting its text with Apache POI.
 */
public class PoiParser implements Parser {

    /**
     * Extracts text from the given stream using whichever extractor
     * {@link ExtractorFactory#createExtractor(InputStream)} returns for it.
     *
     * @param stream the stream to read the document from
     * @return a {@link Document} whose content is the text returned by
     *         {@link POITextExtractor#getText()}
     * @throws RuntimeException wrapping the underlying {@link IOException} when
     *         creating the extractor, reading the text, or closing the extractor fails
     */
    @Override
    public Document parse(InputStream stream) {
        final String extracted;
        // The extractor is released by try-with-resources before the result is wrapped.
        try (POITextExtractor textExtractor = ExtractorFactory.createExtractor(stream)) {
            extracted = textExtractor.getText();
        } catch (IOException cause) {
            // Surface I/O problems as unchecked, keeping the original exception as the cause.
            throw new RuntimeException(cause);
        }
        return new Document(extracted);
    }
}

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Aggregator POM for the document-parser modules. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the project's root parent POM. -->
<parent>
<groupId>com.agentsflex</groupId>
<artifactId>parent</artifactId>
<version>1.0.0-alpha.1</version>
</parent>
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-document-parser</artifactId>
<!-- "pom" packaging: this project only groups the modules listed below. -->
<packaging>pom</packaging>
<!-- One module per parsing library. -->
<modules>
<module>agents-flex-document-parser-pdfbox</module>
<module>agents-flex-document-parser-poi</module>
</modules>
<!-- Java 8 source/target level and UTF-8 source encoding. -->
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>

View File

@ -21,7 +21,6 @@
<groupId>com.agentsflex</groupId>
<artifactId>agents-flex-core</artifactId>
<version>1.0.0-alpha.1</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>junit</groupId>

View File

@ -47,6 +47,7 @@
<module>agents-flex-samples</module>
<module>agents-flex-llm</module>
<module>agents-flex-storage</module>
<module>agents-flex-document-parser</module>
</modules>
<properties>