tesseract ocr

This commit is contained in:
manalejandro 2018-07-15 20:02:24 +02:00
parent 2597cbeee5
commit 81bb011a34
3 changed files with 35 additions and 5 deletions

18
pom.xml
View File

@ -52,6 +52,12 @@
<version>1.18</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.18</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
<dependency>
<groupId>com.github.jai-imageio</groupId>
@ -59,6 +65,18 @@
<version>1.3.0</version>
</dependency>
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
<version>2.0</version>
</dependency>
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.webjars</groupId>
<artifactId>bootstrap</artifactId>

View File

@ -8,7 +8,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.Normalizer;
import java.util.ArrayList;
import javax.servlet.http.HttpServletResponse;
@ -25,12 +24,14 @@ import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
@ -50,6 +51,9 @@ public class MainController {
@Value("${arjion.uploadpath}")
private String uploadpath;
@Value("${arjion.tesseractpath}")
private String tesseractpath;
@Autowired
public MainController(MainService mainService) {
this.mainService = mainService;
@ -87,7 +91,15 @@ public class MainController {
Path path = Paths.get(uploadpath + filename);
// Required instances
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
Parser parser = new AutoDetectParser(tikaConfig);
PDFParserConfig pdfConfig = new PDFParserConfig();
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
tesseractConfig.setTesseractPath(tesseractpath);
pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
parseContext.set(PDFParserConfig.class, pdfConfig);
parseContext.set(Parser.class, parser);
// Use -1 to disable the default 100000-character write limit
ContentHandler handler = new BodyContentHandler(-1);
// Convert the bytes into Tika's stream
@ -132,7 +144,6 @@ public class MainController {
HttpHeaders responseHeaders = new HttpHeaders();
responseHeaders.add("Content-Disposition", "attachment; filename=" + URLEncoder.encode(filename, "UTF-8"));
responseHeaders.add("Content-Type", type);
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders)
.contentType(MediaType.parseMediaType("application/octet-stream")).body(resource);
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders).body(resource);
}
}

View File

@ -7,6 +7,7 @@ elasticsearch.nodename=arjion
arjion.indexName=documentos
arjion.documentType=documento
arjion.uploadpath=/upload/
arjion.tesseractpath=/usr/bin/tesseract
spring.main.allow-bean-definition-overriding=true
spring.thymeleaf.enabled=true
spring.thymeleaf.prefix=classpath:/templates/