mirror of
https://gitee.com/agents-flex/agents-flex.git
synced 2024-11-29 18:38:17 +08:00
feat: add pdfbox and poi document parser
This commit is contained in:
parent
ccc164e5ca
commit
9fac085a72
@ -20,6 +20,13 @@ import com.agentsflex.util.Metadata;
|
||||
public class Document extends Metadata {
|
||||
private String content;
|
||||
|
||||
public Document() {
|
||||
}
|
||||
|
||||
public Document(String content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
@ -0,0 +1,40 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-document-parser</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>agents-flex-document-parser-pdfbox</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-core</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.30</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
@ -0,0 +1,22 @@
|
||||
package com.agentsflex.document.parser;
|
||||
|
||||
import com.agentsflex.document.Document;
|
||||
import com.agentsflex.document.Parser;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
public class PdfBoxParser implements Parser {
|
||||
@Override
|
||||
public Document parse(InputStream stream) {
|
||||
try (PDDocument pdfDocument = PDDocument.load(stream)) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
String text = stripper.getText(pdfDocument);
|
||||
return new Document(text);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,48 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-document-parser</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>agents-flex-document-parser-poi</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<apache.poi.version>5.2.5</apache.poi.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-core</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi</artifactId>
|
||||
<version>${apache.poi.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>${apache.poi.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-scratchpad</artifactId>
|
||||
<version>${apache.poi.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
@ -0,0 +1,21 @@
|
||||
package com.agentsflex.document.parser;
|
||||
|
||||
import com.agentsflex.document.Document;
|
||||
import com.agentsflex.document.Parser;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
public class PoiParser implements Parser {
|
||||
@Override
|
||||
public Document parse(InputStream stream) {
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(stream)) {
|
||||
String text = extractor.getText();
|
||||
return new Document(text);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
27
agents-flex-document-parser/pom.xml
Normal file
27
agents-flex-document-parser/pom.xml
Normal file
@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>parent</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
</parent>
|
||||
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-document-parser</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<modules>
|
||||
<module>agents-flex-document-parser-pdfbox</module>
|
||||
<module>agents-flex-document-parser-poi</module>
|
||||
</modules>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
</project>
|
@ -21,7 +21,6 @@
|
||||
<groupId>com.agentsflex</groupId>
|
||||
<artifactId>agents-flex-core</artifactId>
|
||||
<version>1.0.0-alpha.1</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
|
Loading…
Reference in New Issue
Block a user