source: gs2-extensions/tesseract/trunk/src/packages/CASCADE-MAKE/TESSERACT.sh@ 34186

Last change on this file since 34186 was 34186, checked in by ak19, 4 years ago

In order to get tika + tesseract to OCR PDFs (note that tesseract can't OCR PDFs on its own), need to pass a tika-config.xml file to tika that is configured to use txt OR hocr as outputType, and if outputType=hocr then need to have the tesseract/tessdata/configs folder contain a file called hocr at minimum. Now the build process ensures that the tessdata/configs and other tessdata subfolders in the extracted tesseract source package get copied across into the GEXTTESS_INSTALLED install location. Updating the README with the notes and the tesseract bin tarball.

  • Property svn:executable set to *
File size: 2.6 KB
Line 
#!/bin/bash

# Cascade-make build script for the tesseract source package.
#
# Sources the shared cascade-make library, then points the compiler, linker
# and pkg-config at the GEXTTESS install tree so that tesseract is built
# against the libraries already installed there.
#
# Arguments: forwarded unchanged to cascade-lib.bash.
# NOTE(review): cascade-lib.bash is not visible here; it presumably defines
# GEXTTESS_INSTALLED, the opt_run_* helpers and the stage variables used
# further down - confirm against the library.

package=tesseract
version=-5.0.0

progname=$0

# "$@" (rather than unquoted $*) hands each argument through exactly as the
# caller supplied it, without re-splitting or glob expansion.
source ../cascade-make/lib/cascade-lib.bash GEXTTESS ../.. "$@"

prefix=$GEXTTESS_INSTALLED

# See imagemagick ext
if [[ -n "$CROSSCONFIGURE_ARGS" ]]; then
  echo "WARNING: Crossconfiguring not supported yet"
fi

export CFLAGS="$CFLAGS -I$GEXTTESS_INSTALLED/include"
export CPPFLAGS="$CPPFLAGS -I$GEXTTESS_INSTALLED/include"
export CXXFLAGS="$CXXFLAGS -I$GEXTTESS_INSTALLED/include"
export LDFLAGS="$LDFLAGS -L$GEXTTESS_INSTALLED/lib"
# Replaces (does not extend) any LD_LIBRARY_PATH inherited from the caller.
export LD_LIBRARY_PATH="$GEXTTESS_INSTALLED/lib"
# Need PKG_CONFIG_PATH set to leptonica's lib/pkgconfig folder (containing lept.pc file)
export PKG_CONFIG_PATH="$GEXTTESS_INSTALLED/lib/pkgconfig"

# The stage variables below are deliberately left unquoted, exactly as the
# helpers have always been called; how cascade-lib treats empty arguments is
# not visible from this script.
opt_run_untar $force_untar $auto_untar $package $version

# Need to do this for TESSERACT, before we can do configure->make->make install
# Only run the autotools bootstrap when we actually managed to enter the
# source directory; otherwise libtoolize/autogen.sh would run in the wrong
# place and the popd would be unbalanced.
if pushd "$package$version"; then
  libtoolize
  #aclocal
  #autoheader
  sh autogen.sh
  popd
else
  echo "WARNING: could not enter $package$version; skipping libtoolize and autogen.sh" >&2
fi

opt_run_configure $force_config $auto_config $package $version $prefix \
  --disable-shared --enable-static

opt_run_make $compile $package $version
opt_run_make $install $package $version "install"
opt_run_make $clean $package $version "clean"
opt_run_make $distclean $package $version "distclean"

opt_run_tarclean $tarclean $package $version


# Unpack the bundled language data into the install tree, then copy the
# tessdata support files from the extracted source package alongside it.
echo "Installing basic tesseract languages support (tessdata)"
cp "$GEXTTESS_DEVEL/packages/tessdata-langs.tar.gz" "$GEXTTESS_INSTALLED/."
# Guard the directory change so that rm/mkdir never act on the wrong
# directory when the install location is missing.
if pushd "$GEXTTESS_INSTALLED"; then
  tar -xvzf tessdata-langs.tar.gz
  rm tessdata-langs.tar.gz
  mkdir -p tessdata/tessconfigs
  popd
else
  echo "WARNING: could not enter $GEXTTESS_INSTALLED; tessdata-langs.tar.gz was not unpacked" >&2
fi

# Not sure why source package's tessdata didn't get installed in installdir
# despite exporting TESSDATA_PREFIX at the start at cascade-make process.
cp -r "$package$version/tessdata/configs" "$GEXTTESS_INSTALLED/tessdata/"
cp "$package$version/tessdata/eng.user-patterns" "$GEXTTESS_INSTALLED/tessdata/."
cp "$package$version/tessdata/eng.user-words" "$GEXTTESS_INSTALLED/tessdata/."
# The glob parts stay outside the quotes so that they still expand.
cp "$package$version"/tessdata/tessconfigs/*batch* "$GEXTTESS_INSTALLED/tessdata/tessconfigs/."
cp "$package$version"/tessdata/tessconfigs/*demo* "$GEXTTESS_INSTALLED/tessdata/tessconfigs/."


echo "Done installing basic tesseract languages"
echo "Visit https://github.com/tesseract-ocr/tessdata for a full list of trained language data."
echo "To download support for any specific language(s), note the 3 letter code of that language"
echo "Go into your $GEXTTESS_INSTALLED/tessdata and for each language run: "
echo " wget https://github.com/tesseract-ocr/tessdata/raw/master/<3-letter-lang-code>.traineddata"
echo "To get all languages currently supported by Tesseract, delete"
echo "$GEXTTESS_INSTALLED/tessdata"
# Fixed variable name: the original expanded the misspelled, unset
# GEXTTES_INSTALLED, so the message printed an empty path.
echo "and in $GEXTTESS_INSTALLED run:"
echo " git clone https://github.com/tesseract-ocr/tessdata"
echo ""
Note: See TracBrowser for help on using the repository browser.