tesseract context trouble
This commit is contained in:
parent
d99f758e41
commit
01f386732d
@ -54,6 +54,9 @@ public class MainController {
|
|||||||
@Value("${arjion.tesseractpath}")
|
@Value("${arjion.tesseractpath}")
|
||||||
private String tesseractpath;
|
private String tesseractpath;
|
||||||
|
|
||||||
|
@Value("${arjion.tesseractdatapath}")
|
||||||
|
private String tesseractdatapath;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
public MainController(MainService mainService) {
|
public MainController(MainService mainService) {
|
||||||
this.mainService = mainService;
|
this.mainService = mainService;
|
||||||
@ -95,18 +98,18 @@ public class MainController {
|
|||||||
PDFParserConfig pdfConfig = new PDFParserConfig();
|
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||||
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
|
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
|
||||||
tesseractConfig.setTesseractPath(tesseractpath);
|
tesseractConfig.setTesseractPath(tesseractpath);
|
||||||
|
tesseractConfig.setTessdataPath(tesseractdatapath);
|
||||||
tesseractConfig.setLanguage("spa+eng");
|
tesseractConfig.setLanguage("spa+eng");
|
||||||
pdfConfig.setExtractInlineImages(true);
|
pdfConfig.setExtractInlineImages(true);
|
||||||
ParseContext parseContext = new ParseContext();
|
ParseContext parseContext = new ParseContext();
|
||||||
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
|
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
|
||||||
parseContext.set(PDFParserConfig.class, pdfConfig);
|
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||||
parseContext.set(Parser.class, parser);
|
|
||||||
// Usa -1 para no tener límite de 100000 chars
|
// Usa -1 para no tener límite de 100000 chars
|
||||||
ContentHandler handler = new BodyContentHandler(-1);
|
ContentHandler handler = new BodyContentHandler(-1);
|
||||||
// Castea los bytes al Stream de Tika
|
// Castea los bytes al Stream de Tika
|
||||||
TikaInputStream stream = TikaInputStream.get(bytes);
|
TikaInputStream stream = TikaInputStream.get(bytes);
|
||||||
// Parsea el contenido
|
// Parsea el contenido
|
||||||
parser.parse(stream, handler, metadata, new ParseContext());
|
parser.parse(stream, handler, metadata, parseContext);
|
||||||
// Identifica el idioma del archivo
|
// Identifica el idioma del archivo
|
||||||
LanguageIdentifier identifier = new LanguageIdentifier(handler.toString());
|
LanguageIdentifier identifier = new LanguageIdentifier(handler.toString());
|
||||||
// Almacena en elasticsearch
|
// Almacena en elasticsearch
|
||||||
|
@ -7,7 +7,8 @@ elasticsearch.nodename=arjion
|
|||||||
arjion.indexName=documentos
|
arjion.indexName=documentos
|
||||||
arjion.documentType=documento
|
arjion.documentType=documento
|
||||||
arjion.uploadpath=/upload/
|
arjion.uploadpath=/upload/
|
||||||
arjion.tesseractpath=/usr/bin/tesseract
|
arjion.tesseractpath=/usr/bin
|
||||||
|
arjion.tesseractdatapath=/usr/share/tesseract-ocr
|
||||||
spring.main.allow-bean-definition-overriding=true
|
spring.main.allow-bean-definition-overriding=true
|
||||||
spring.thymeleaf.enabled=true
|
spring.thymeleaf.enabled=true
|
||||||
spring.thymeleaf.prefix=classpath:/templates/
|
spring.thymeleaf.prefix=classpath:/templates/
|
||||||
|
Loading…
Reference in New Issue
Block a user