clean up
This commit is contained in:
@@ -5,6 +5,7 @@ namespace App\Http\Controllers\Api;
|
|||||||
use Illuminate\Http\Request;
|
use Illuminate\Http\Request;
|
||||||
use thiagoalessio\TesseractOCR\TesseractOCR;
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
||||||
use FFMpeg;
|
use FFMpeg;
|
||||||
|
use App\Enum\CurlErrorCodes;
|
||||||
|
|
||||||
class OCRController extends ApiController
|
class OCRController extends ApiController
|
||||||
{
|
{
|
||||||
@@ -30,157 +31,169 @@ class OCRController extends ApiController
|
|||||||
if ($url !== null) {
|
if ($url !== null) {
|
||||||
$data = ['ocr' => []];
|
$data = ['ocr' => []];
|
||||||
|
|
||||||
$oem = $request->get('oem');
|
$filters = $request->get('filters', ['tesseract']);
|
||||||
$digits = $request->get('digits');
|
if(is_array($filters) === false) {
|
||||||
$allowlist = $request->get('allowlist');
|
$filters = explode(',', $filters);
|
||||||
|
}
|
||||||
|
|
||||||
$tmpfname = tempnam(sys_get_temp_dir(), 'download');
|
$tesseractOEM = $request->get('tesseract.oem');
|
||||||
|
$tesseractDigits = $request->get('tesseract.digits');
|
||||||
|
$tesseractAllowlist = $request->get('tesseract.allowlist');
|
||||||
|
|
||||||
|
// Download URL
|
||||||
|
$urlDownloadFilePath = tempnam(sys_get_temp_dir(), 'download');
|
||||||
|
$maxDownloadSize = (1024 * 1024); // 1MB
|
||||||
$ch = curl_init();
|
$ch = curl_init();
|
||||||
curl_setopt($ch, CURLOPT_URL, $url);
|
curl_setopt($ch, CURLOPT_URL, $url);
|
||||||
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
|
||||||
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
|
||||||
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
|
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
|
||||||
|
|
||||||
|
// We need progress updates to break the connection mid-way
|
||||||
|
curl_setopt($ch, CURLOPT_BUFFERSIZE, 128); // more progress info
|
||||||
|
curl_setopt($ch, CURLOPT_NOPROGRESS, false);
|
||||||
|
curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, function(
|
||||||
|
$downloadSize, $downloaded, $uploadSize, $uploaded
|
||||||
|
) use($maxDownloadSize) {
|
||||||
|
return ($downloaded > $maxDownloadSize) ? 1 : 0;
|
||||||
|
});
|
||||||
|
|
||||||
$curlResult = curl_exec($ch);
|
$curlResult = curl_exec($ch);
|
||||||
|
$curlError = curl_errno($ch);
|
||||||
|
$curlSize = curl_getinfo($ch, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
|
||||||
curl_close($ch);
|
curl_close($ch);
|
||||||
|
if($curlError !== 0) {
|
||||||
|
$error = 'File size is larger then allowed';
|
||||||
|
if($curlError !== CurlErrorCodes::CURLE_ABORTED_BY_CALLBACK) {
|
||||||
|
$error = CurlErrorCodes::getMessage($curlError);
|
||||||
|
}
|
||||||
|
|
||||||
file_put_contents($tmpfname, $curlResult);
|
return $this->respondWithErrors(['url' => $error]);
|
||||||
|
|
||||||
// Raw OCR
|
|
||||||
$ocr = new TesseractOCR();
|
|
||||||
$ocr->image($tmpfname);
|
|
||||||
if ($oem !== null) {
|
|
||||||
$ocr->oem($oem);
|
|
||||||
}
|
}
|
||||||
if ($digits !== null) {
|
|
||||||
$ocr->digits();
|
|
||||||
}
|
|
||||||
if ($allowlist !== null) {
|
|
||||||
$ocr->allowlist($allowlist);
|
|
||||||
}
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
$data['ocr']['raw'] = $result;
|
|
||||||
|
|
||||||
$basefile_path = preg_replace('/\\.[^.\\s]{3,4}$/', '', $tmpfname);
|
// Save url file
|
||||||
|
file_put_contents($urlDownloadFilePath, $curlResult);
|
||||||
|
$urlDownloadFilePathBase = preg_replace('/\\.[^.\\s]{3,4}$/', '', $urlDownloadFilePath);
|
||||||
|
|
||||||
// Greyscale OCR
|
// tesseract (overall)
|
||||||
$result = '';
|
$ocr = null;
|
||||||
$imgcreate = imagecreatefrompng($tmpfname);
|
foreach($filters as $filterItem) {
|
||||||
if ($imgcreate !== false && imagefilter($imgcreate, IMG_FILTER_GRAYSCALE) === true) {
|
if(str_starts_with($filterItem, 'tesseract') === true) {
|
||||||
$tmpfname_greyscape = $basefile_path . '_grayscale.png';
|
$ocr = new TesseractOCR();
|
||||||
imagepng($imgcreate, $tmpfname_greyscape);
|
$ocr->image($urlDownloadFilePath);
|
||||||
$ocr->image($tmpfname_greyscape);
|
if ($tesseractOEM !== null) {
|
||||||
|
$ocr->oem($tesseractOEM);
|
||||||
|
}
|
||||||
|
if ($tesseractDigits !== null) {
|
||||||
|
$ocr->digits();
|
||||||
|
}
|
||||||
|
if ($tesseractAllowlist !== null) {
|
||||||
|
$ocr->allowlist($tesseractAllowlist);
|
||||||
|
}
|
||||||
|
$result = $ocr->run(500);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Image Filter Function
|
||||||
|
/*
 * Applies a GD image filter to the downloaded image and runs Tesseract on
 * the filtered result.
 *
 * $filter  - an IMG_FILTER_* constant passed straight to imagefilter().
 * $options - optional single extra argument for filters that take one
 *            (the caller passes 3 for IMG_FILTER_PIXELATE); null means
 *            the filter is applied without an argument.
 *
 * Returns the recognised text, or '' when the download cannot be decoded
 * as an image, the filter fails, or the filtered image cannot be encoded.
 */
$tesseractImageFilterFunc = function ($filter, $options = null) use ($curlResult, $ocr) {
    $image = imagecreatefromstring($curlResult);
    if ($image === false) {
        return '';
    }

    $filtered = ($options === null)
        ? imagefilter($image, $filter)
        : imagefilter($image, $filter, $options);

    if ($filtered !== true) {
        // Release the GD image on the failure path as well.
        imagedestroy($image);

        return '';
    }

    // imageData() expects the encoded image bytes and their length, not a GD
    // image object, so serialise the filtered image to PNG in memory first.
    ob_start();
    $encoded = imagepng($image);
    $imageData = ob_get_clean();
    imagedestroy($image);

    if ($encoded !== true || $imageData === false || $imageData === '') {
        return '';
    }

    // The size must describe the filtered PNG, not the Content-Length of the
    // original download (which cURL reports as -1 when the header is absent).
    $ocr->imageData($imageData, strlen($imageData));

    return $ocr->run(500);
};
|
||||||
|
|
||||||
|
// Image Scale Function
|
||||||
|
/*
 * Resamples the downloaded image to a new size and runs Tesseract on the
 * resized copy.
 *
 * $scaleFunc - callable that receives one source dimension in pixels and
 *              returns the target dimension (callers pass "* 2" and "/ 2").
 *
 * Returns the recognised text, or '' when the download cannot be decoded
 * as an image or the resized image cannot be created or encoded.
 */
$tesseractImageScaleFunc = function ($scaleFunc) use ($curlResult, $ocr) {
    $srcImage = imagecreatefromstring($curlResult);
    if ($srcImage === false) {
        return '';
    }

    $srcWidth = imagesx($srcImage);
    $srcHeight = imagesy($srcImage);

    // Halving an odd size yields a float and halving 1px yields 0.5; GD needs
    // whole dimensions of at least one pixel.
    $dstWidth = max(1, (int) $scaleFunc($srcWidth));
    $dstHeight = max(1, (int) $scaleFunc($srcHeight));

    $dstImage = imagecreatetruecolor($dstWidth, $dstHeight);
    if ($dstImage === false) {
        imagedestroy($srcImage);

        return '';
    }

    imagecopyresampled($dstImage, $srcImage, 0, 0, 0, 0, $dstWidth, $dstHeight, $srcWidth, $srcHeight);

    // Capture the PNG encoding of the resized image in memory.
    ob_start();
    $encoded = imagepng($dstImage);
    $imgOutput = ob_get_clean();

    imagedestroy($srcImage);
    imagedestroy($dstImage);

    if ($encoded !== true || $imgOutput === false || $imgOutput === '') {
        return '';
    }

    // Hand Tesseract the encoded bytes; the GD image has already been
    // destroyed and was never valid input for imageData().
    $ocr->imageData($imgOutput, strlen($imgOutput));

    return $ocr->run(500);
};
|
||||||
|
|
||||||
|
// filter: tesseract
|
||||||
|
if(in_array('tesseract', $filters) === true) {
|
||||||
|
$data['ocr']['tesseract'] = $ocr->run(500);
|
||||||
}
|
}
|
||||||
|
|
||||||
$data['ocr']['greyscale'] = $result;
|
// filter: tesseract.grayscale
|
||||||
imagedestroy($imgcreate);
|
if (in_array('tesseract.grayscale', $filters) === true) {
|
||||||
|
$data['ocr']['tesseract.grayscale'] = $tesseractImageFilterFunc(IMG_FILTER_GRAYSCALE);
|
||||||
// Double Scale
|
|
||||||
$result = '';
|
|
||||||
$srcImage = imagecreatefrompng($tmpfname);
|
|
||||||
$srcWidth = imagesx($srcImage);
|
|
||||||
$srcHeight = imagesy($srcImage);
|
|
||||||
|
|
||||||
$dstWidth = ($srcWidth * 2);
|
|
||||||
$dstHeight = ($srcHeight * 2);
|
|
||||||
$dstImage = imagecreatetruecolor($dstWidth, $dstHeight);
|
|
||||||
|
|
||||||
// Copy and resize the original image onto the new canvas
|
|
||||||
imagecopyresampled($dstImage, $srcImage, 0, 0, 0, 0, $dstWidth, $dstHeight, $srcWidth, $srcHeight);
|
|
||||||
|
|
||||||
// Generate a temporary filename for the doubled-scale image
|
|
||||||
$tmpfname_scaled = tempnam(sys_get_temp_dir(), 'double_scale');
|
|
||||||
imagepng($dstImage, $tmpfname_scaled);
|
|
||||||
imagedestroy($srcImage);
|
|
||||||
imagedestroy($dstImage);
|
|
||||||
|
|
||||||
// OCR it
|
|
||||||
$ocr->image($tmpfname_scaled);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
unlink($tmpfname_scaled);
|
|
||||||
$data['ocr']['double_scale'] = $result;
|
|
||||||
|
|
||||||
// Half Scale
|
|
||||||
$result = '';
|
|
||||||
$srcImage = imagecreatefrompng($tmpfname);
|
|
||||||
$srcWidth = imagesx($srcImage);
|
|
||||||
$srcHeight = imagesy($srcImage);
|
|
||||||
|
|
||||||
$dstWidth = ($srcWidth / 2);
|
|
||||||
$dstHeight = ($srcHeight / 2);
|
|
||||||
$dstImage = imagecreatetruecolor($dstWidth, $dstHeight);
|
|
||||||
|
|
||||||
// Copy and resize the original image onto the new canvas
|
|
||||||
imagecopyresampled($dstImage, $srcImage, 0, 0, 0, 0, $dstWidth, $dstHeight, $srcWidth, $srcHeight);
|
|
||||||
|
|
||||||
// Generate a temporary filename for the doubled-scale image
|
|
||||||
$tmpfname_scaled = tempnam(sys_get_temp_dir(), 'double_scale');
|
|
||||||
imagepng($dstImage, $tmpfname_scaled);
|
|
||||||
imagedestroy($srcImage);
|
|
||||||
imagedestroy($dstImage);
|
|
||||||
|
|
||||||
// OCR it
|
|
||||||
$ocr->image($tmpfname_scaled);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
unlink($tmpfname_scaled);
|
|
||||||
$data['ocr']['half_scale'] = $result;
|
|
||||||
|
|
||||||
// EdgeDetect
|
|
||||||
$result = '';
|
|
||||||
$imgcreate = imagecreatefrompng($tmpfname);
|
|
||||||
if ($imgcreate !== false && imagefilter($imgcreate, IMG_FILTER_EDGEDETECT) === true) {
|
|
||||||
$tmpfname_edgedetect = $basefile_path . '_edgedetect.png';
|
|
||||||
imagepng($imgcreate, $tmpfname_edgedetect);
|
|
||||||
$ocr->image($tmpfname_edgedetect);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$data['ocr']['edge_detect'] = $result;
|
// filter: tesseract.double_scale
|
||||||
imagedestroy($imgcreate);
|
if (in_array('tesseract.double_scale', $filters) === true) {
|
||||||
|
$data['ocr']['tesseract.double_scale'] = $tesseractImageScaleFunc(function($size) {
|
||||||
// Mean Removal
|
return $size * 2;
|
||||||
$result = '';
|
});
|
||||||
$imgcreate = imagecreatefrompng($tmpfname);
|
|
||||||
if ($imgcreate !== false && imagefilter($imgcreate, IMG_FILTER_MEAN_REMOVAL) === true) {
|
|
||||||
$tmpfname_edgedetect = $basefile_path . '_meanremoval.png';
|
|
||||||
imagepng($imgcreate, $tmpfname_edgedetect);
|
|
||||||
$ocr->image($tmpfname_edgedetect);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
}
|
}
|
||||||
$data['ocr']['mean_removal'] = $result;
|
|
||||||
imagedestroy($imgcreate);
|
|
||||||
|
|
||||||
// Negate
|
// filter: tesseract.half_scale
|
||||||
$result = '';
|
if (in_array('tesseract.half_scale', $filters) === true) {
|
||||||
$imgcreate = imagecreatefrompng($tmpfname);
|
$data['ocr']['tesseract.half_scale'] = $tesseractImageScaleFunc(function($size) {
|
||||||
if ($imgcreate !== false && imagefilter($imgcreate, IMG_FILTER_NEGATE) === true) {
|
return $size / 2;
|
||||||
$tmpfname_edgedetect = $basefile_path . '_negate.png';
|
});
|
||||||
imagepng($imgcreate, $tmpfname_edgedetect);
|
|
||||||
$ocr->image($tmpfname_edgedetect);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
}
|
}
|
||||||
$data['ocr']['negate'] = $result;
|
|
||||||
imagedestroy($imgcreate);
|
|
||||||
|
|
||||||
// Pixelate
|
// filter: tesseract.edgedetect
|
||||||
$result = '';
|
if (in_array('tesseract.edgedetect', $filters) === true) {
|
||||||
$imgcreate = imagecreatefrompng($tmpfname);
|
$data['ocr']['tesseract.edgedetect'] = $tesseractImageFilterFunc(IMG_FILTER_EDGEDETECT);
|
||||||
if ($imgcreate !== false && imagefilter($imgcreate, IMG_FILTER_PIXELATE, 3) === true) {
|
|
||||||
$tmpfname_edgedetect = $basefile_path . '_pixelate.png';
|
|
||||||
imagepng($imgcreate, $tmpfname_edgedetect);
|
|
||||||
$ocr->image($tmpfname_edgedetect);
|
|
||||||
$result = $ocr->run(500);
|
|
||||||
}
|
}
|
||||||
$data['ocr']['pixelate'] = $result;
|
|
||||||
imagedestroy($imgcreate);
|
|
||||||
|
|
||||||
// keras
|
// filter: tesseract.mean_removal
|
||||||
$cmd = 'python3 ' . base_path() . '/scripts/keras_oc.py ' . $url;
|
if (in_array('tesseract.mean_removal', $filters) === true) {
|
||||||
$command = escapeshellcmd($cmd); #no special characters it will work
|
$data['ocr']['tesseract.mean_removal'] = $tesseractImageFilterFunc(IMG_FILTER_MEAN_REMOVAL);
|
||||||
$data['ocr']['keras'] = shell_exec($command);
|
}
|
||||||
|
|
||||||
unlink($tmpfname);
|
// filter: tesseract.negate
|
||||||
|
if (in_array('tesseract.negate', $filters) === true) {
|
||||||
|
$data['ocr']['tesseract.negate'] = $tesseractImageFilterFunc(IMG_FILTER_NEGATE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// filter: tesseract.pixelate
|
||||||
|
if (in_array('tesseract.pixelate', $filters) === true) {
|
||||||
|
$data['ocr']['tesseract.pixelate'] = $tesseractImageFilterFunc(IMG_FILTER_PIXELATE, 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// filter: keras
|
||||||
|
// Runs the keras-ocr helper script on the URL and stores whatever it prints
// after the START marker under $data['ocr']['keras'] ('' when the script
// produced no output or never printed the marker).
if (in_array('keras', $filters, true) === true) {
    $marker = '----------START----------';

    // Quote each argument individually and execute the quoted command.
    // The URL stays percent-encoded because the script decodes its argument
    // with urllib.parse.unquote().
    $command = '/usr/bin/python3 '
        . escapeshellarg(base_path() . '/scripts/keras_oc.py')
        . ' '
        . escapeshellarg(urlencode($url));

    $output = shell_exec($command);

    // shell_exec() yields null/false on failure or empty output; strpos()
    // yields false when the marker is missing. Both cases mean there is no
    // OCR text to return, rather than cutting the output at a fixed offset.
    $markerPos = is_string($output) ? strpos($output, $marker) : false;

    $data['ocr']['keras'] = ($markerPos === false)
        ? ''
        : substr($output, $markerPos + strlen($marker));
}
|
||||||
|
|
||||||
|
unlink($urlDownloadFilePath);
|
||||||
return $this->respondJson($data);
|
return $this->respondJson($data);
|
||||||
}//end if
|
}//end if
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,13 @@
|
|||||||
import io
|
|
||||||
import sys
|
import sys
|
||||||
import base64
|
import urllib.parse
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import keras_ocr
|
import keras_ocr
|
||||||
|
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
# Decode the base64-encoded image
|
url = urllib.parse.unquote(sys.argv[1])
|
||||||
img = base64.b64decode(sys.argv[1])
|
image = keras_ocr.tools.read(url)
|
||||||
img = np.array(bytearray(img), dtype=np.uint8)
|
|
||||||
|
|
||||||
# Use Keras-OCR to recognize text in the image
|
|
||||||
pipeline = keras_ocr.pipeline.Pipeline()
|
pipeline = keras_ocr.pipeline.Pipeline()
|
||||||
prediction = pipeline.recognize([img])
|
prediction = pipeline.recognize([image])
|
||||||
|
print("----------START----------")
|
||||||
# Return the recognized text
|
for text, box in prediction [0]:
|
||||||
print prediction[0][0]['text']
|
print(text)
|
||||||
|
|||||||
Reference in New Issue
Block a user