tesseract ocr
This commit is contained in:
parent
2597cbeee5
commit
81bb011a34
18
pom.xml
18
pom.xml
@ -52,6 +52,12 @@
|
|||||||
<version>1.18</version>
|
<version>1.18</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.tika</groupId>
|
||||||
|
<artifactId>tika-parsers</artifactId>
|
||||||
|
<version>1.18</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
|
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.jai-imageio</groupId>
|
<groupId>com.github.jai-imageio</groupId>
|
||||||
@ -59,6 +65,18 @@
|
|||||||
<version>1.3.0</version>
|
<version>1.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.levigo.jbig2</groupId>
|
||||||
|
<artifactId>levigo-jbig2-imageio</artifactId>
|
||||||
|
<version>2.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.github.jai-imageio</groupId>
|
||||||
|
<artifactId>jai-imageio-core</artifactId>
|
||||||
|
<version>1.4.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.webjars</groupId>
|
<groupId>org.webjars</groupId>
|
||||||
<artifactId>bootstrap</artifactId>
|
<artifactId>bootstrap</artifactId>
|
||||||
|
@ -8,7 +8,6 @@ import java.nio.file.Files;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
import javax.servlet.http.HttpServletResponse;
|
import javax.servlet.http.HttpServletResponse;
|
||||||
|
|
||||||
@ -25,12 +24,14 @@ import org.apache.tika.language.LanguageIdentifier;
|
|||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.apache.tika.parser.AutoDetectParser;
|
import org.apache.tika.parser.AutoDetectParser;
|
||||||
import org.apache.tika.parser.ParseContext;
|
import org.apache.tika.parser.ParseContext;
|
||||||
|
import org.apache.tika.parser.Parser;
|
||||||
|
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||||
|
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||||
import org.apache.tika.sax.BodyContentHandler;
|
import org.apache.tika.sax.BodyContentHandler;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
import org.springframework.core.io.ByteArrayResource;
|
import org.springframework.core.io.ByteArrayResource;
|
||||||
import org.springframework.http.HttpHeaders;
|
import org.springframework.http.HttpHeaders;
|
||||||
import org.springframework.http.MediaType;
|
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
import org.springframework.ui.Model;
|
import org.springframework.ui.Model;
|
||||||
@ -50,6 +51,9 @@ public class MainController {
|
|||||||
@Value("${arjion.uploadpath}")
|
@Value("${arjion.uploadpath}")
|
||||||
private String uploadpath;
|
private String uploadpath;
|
||||||
|
|
||||||
|
@Value("${arjion.tesseractpath}")
|
||||||
|
private String tesseractpath;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public MainController(MainService mainService) {
|
public MainController(MainService mainService) {
|
||||||
this.mainService = mainService;
|
this.mainService = mainService;
|
||||||
@ -87,7 +91,15 @@ public class MainController {
|
|||||||
Path path = Paths.get(uploadpath + filename);
|
Path path = Paths.get(uploadpath + filename);
|
||||||
// Instancias necesarias
|
// Instancias necesarias
|
||||||
Metadata metadata = new Metadata();
|
Metadata metadata = new Metadata();
|
||||||
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
|
Parser parser = new AutoDetectParser(tikaConfig);
|
||||||
|
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||||
|
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
|
||||||
|
tesseractConfig.setTesseractPath(tesseractpath);
|
||||||
|
pdfConfig.setExtractInlineImages(true);
|
||||||
|
ParseContext parseContext = new ParseContext();
|
||||||
|
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
|
||||||
|
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||||
|
parseContext.set(Parser.class, parser);
|
||||||
// Usa -1 para no tener límite de 100000 chars
|
// Usa -1 para no tener límite de 100000 chars
|
||||||
ContentHandler handler = new BodyContentHandler(-1);
|
ContentHandler handler = new BodyContentHandler(-1);
|
||||||
// Castea los bytes al Stream de Tika
|
// Castea los bytes al Stream de Tika
|
||||||
@ -132,7 +144,6 @@ public class MainController {
|
|||||||
HttpHeaders responseHeaders = new HttpHeaders();
|
HttpHeaders responseHeaders = new HttpHeaders();
|
||||||
responseHeaders.add("Content-Disposition", "attachment; filename=" + URLEncoder.encode(filename, "UTF-8"));
|
responseHeaders.add("Content-Disposition", "attachment; filename=" + URLEncoder.encode(filename, "UTF-8"));
|
||||||
responseHeaders.add("Content-Type", type);
|
responseHeaders.add("Content-Type", type);
|
||||||
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders)
|
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders).body(resource);
|
||||||
.contentType(MediaType.parseMediaType("application/octet-stream")).body(resource);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -7,6 +7,7 @@ elasticsearch.nodename=arjion
|
|||||||
arjion.indexName=documentos
|
arjion.indexName=documentos
|
||||||
arjion.documentType=documento
|
arjion.documentType=documento
|
||||||
arjion.uploadpath=/upload/
|
arjion.uploadpath=/upload/
|
||||||
|
arjion.tesseractpath=/usr/bin/tesseract
|
||||||
spring.main.allow-bean-definition-overriding=true
|
spring.main.allow-bean-definition-overriding=true
|
||||||
spring.thymeleaf.enabled=true
|
spring.thymeleaf.enabled=true
|
||||||
spring.thymeleaf.prefix=classpath:/templates/
|
spring.thymeleaf.prefix=classpath:/templates/
|
||||||
|
Loading…
Reference in New Issue
Block a user