tesseract ocr
This commit is contained in:
parent
2597cbeee5
commit
81bb011a34
18
pom.xml
18
pom.xml
@ -52,6 +52,12 @@
|
||||
<version>1.18</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers</artifactId>
|
||||
<version>1.18</version>
|
||||
</dependency>
|
||||
|
||||
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
|
||||
<dependency>
|
||||
<groupId>com.github.jai-imageio</groupId>
|
||||
@ -59,6 +65,18 @@
|
||||
<version>1.3.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.levigo.jbig2</groupId>
|
||||
<artifactId>levigo-jbig2-imageio</artifactId>
|
||||
<version>2.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.github.jai-imageio</groupId>
|
||||
<artifactId>jai-imageio-core</artifactId>
|
||||
<version>1.4.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.webjars</groupId>
|
||||
<artifactId>bootstrap</artifactId>
|
||||
|
@ -8,7 +8,6 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
@ -25,12 +24,14 @@ import org.apache.tika.language.LanguageIdentifier;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.ocr.TesseractOCRConfig;
|
||||
import org.apache.tika.parser.pdf.PDFParserConfig;
|
||||
import org.apache.tika.sax.BodyContentHandler;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.core.io.ByteArrayResource;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.stereotype.Controller;
|
||||
import org.springframework.ui.Model;
|
||||
@ -50,6 +51,9 @@ public class MainController {
|
||||
@Value("${arjion.uploadpath}")
|
||||
private String uploadpath;
|
||||
|
||||
@Value("${arjion.tesseractpath}")
|
||||
private String tesseractpath;
|
||||
|
||||
@Autowired
|
||||
public MainController(MainService mainService) {
|
||||
this.mainService = mainService;
|
||||
@ -87,7 +91,15 @@ public class MainController {
|
||||
Path path = Paths.get(uploadpath + filename);
|
||||
// Instancias necesarias
|
||||
Metadata metadata = new Metadata();
|
||||
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
|
||||
Parser parser = new AutoDetectParser(tikaConfig);
|
||||
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
|
||||
tesseractConfig.setTesseractPath(tesseractpath);
|
||||
pdfConfig.setExtractInlineImages(true);
|
||||
ParseContext parseContext = new ParseContext();
|
||||
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
|
||||
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||
parseContext.set(Parser.class, parser);
|
||||
// Usa -1 para no tener límite de 100000 chars
|
||||
ContentHandler handler = new BodyContentHandler(-1);
|
||||
// Castea los bytes al Stream de Tika
|
||||
@ -132,7 +144,6 @@ public class MainController {
|
||||
HttpHeaders responseHeaders = new HttpHeaders();
|
||||
responseHeaders.add("Content-Disposition", "attachment; filename=" + URLEncoder.encode(filename, "UTF-8"));
|
||||
responseHeaders.add("Content-Type", type);
|
||||
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders)
|
||||
.contentType(MediaType.parseMediaType("application/octet-stream")).body(resource);
|
||||
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders).body(resource);
|
||||
}
|
||||
}
|
@ -7,6 +7,7 @@ elasticsearch.nodename=arjion
|
||||
arjion.indexName=documentos
|
||||
arjion.documentType=documento
|
||||
arjion.uploadpath=/upload/
|
||||
arjion.tesseractpath=/usr/bin/tesseract
|
||||
spring.main.allow-bean-definition-overriding=true
|
||||
spring.thymeleaf.enabled=true
|
||||
spring.thymeleaf.prefix=classpath:/templates/
|
||||
|
Loading…
Reference in New Issue
Block a user