Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bin/MindeeCliCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@
namespace Mindee\Cli;

use Exception;
use Mindee\Error\V1\MindeeV1HttpException;
use Mindee\Input\InputSource;
use Mindee\Input\PageOptions;
use Mindee\Input\PathInput;
use Mindee\Input\UrlInputSource;
use Mindee\V1\Client;
use Mindee\V1\ClientOptions\PredictMethodOptions;
use Mindee\V1\ClientOptions\PredictOptions;
use Mindee\V1\Error\MindeeV1HttpException;
use Mindee\V1\Parsing\Common\AsyncPredictResponse;
use Mindee\V1\Parsing\Common\PredictResponse;
use Symfony\Component\Console\Command\Command;
Expand Down
84 changes: 0 additions & 84 deletions src/Image/ImageExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
use ImagickException;
use Mindee\Dependency\DependencyChecker;
use Mindee\Error\ErrorCode;
use Mindee\Error\MindeeGeometryException;
use Mindee\Error\MindeeImageException;
use Mindee\Error\MindeePdfException;
use Mindee\Geometry\BBox;
use Mindee\Geometry\BBoxUtils;
use Mindee\Geometry\Point;
use Mindee\Geometry\Polygon;
use Mindee\Input\LocalInputSource;
use Mindee\V1\Parsing\Standard\BaseField;

use function count;
use function sprintf;
Expand Down Expand Up @@ -128,20 +126,6 @@ public function getPageCount(): int
return count($this->pageImages);
}

/**
* Extract multiple images on a given page from a list of fields having position data.
*
* @param array<BaseField<string|float|integer|boolean|Polygon>> $fields List of Fields to extract.
* @param integer $pageIndex The page index to extract, begins at 0.
* @param null|string $outputName The base output filename, must have an image extension.
*
* @return array<ExtractedImage> a list of extracted images
*/
public function extractImagesFromPage(array $fields, int $pageIndex, ?string $outputName = null): array
{
    // When no base name is supplied, fall back to this extractor's own filename.
    return $this->extractFromPage($fields, $pageIndex, $outputName ?? $this->filename);
}

/**
* Extracts images from a page.
Expand Down Expand Up @@ -212,47 +196,6 @@ public function extractPolygonFromPage(
return new ExtractedImage($extractedImageData, $filename, $format, $pageIndex, $index);
}

/**
* Extracts a single image from a Position field.
*
* @param BaseField<string|float|integer|boolean|Polygon> $field The field to extract.
* @param integer $pageIndex The page index to extract, begins at 0.
* @param integer $index The index to use for naming the extracted image.
* @param string $filename The output filename.
* @param string $format The output format.
*
* @return null|ExtractedImage The extracted image; a field without valid position data raises an exception instead of yielding null.
*
* @throws MindeeGeometryException Throws if a field does not contain positional data.
*/
public function extractImage(
    BaseField $field,
    int $pageIndex,
    int $index,
    string $filename,
    string $format
): ?ExtractedImage {
    // Position attributes, checked in order; the first non-empty one is used.
    $candidates = ['polygon', 'boundingBox', 'quadrangle', 'rectangle'];

    $polygon = null;
    foreach ($candidates as $attribute) {
        if (!empty($field->{$attribute})) {
            $polygon = $field->{$attribute};
            break;
        }
    }

    if (null !== $polygon) {
        return $this->extractPolygonFromPage($polygon, $pageIndex, $index, $filename, $format);
    }

    // None of the position attributes carried usable data.
    throw new MindeeGeometryException(
        'Provided field has no valid position data.',
        ErrorCode::GEOMETRIC_OPERATION_FAILED
    );
}

/**
* Getter for the local input source.
Expand All @@ -262,33 +205,6 @@ public function getInputSource(): LocalInputSource
return $this->inputSource;
}

/**
* Extracts images from a page.
*
* @param array<BaseField<string|float|integer|boolean|Polygon>> $fields List of Fields to extract.
* @param integer $pageIndex The page index to extract, begins at 0.
* @param string $outputName Name of the created file.
* @param string $format The output format.
*
* @return array<ExtractedImage> An array of created images
*/
protected function extractFromPage(array $fields, int $pageIndex, string $outputName, string $format = 'jpg'): array
{
    $results = [];

    // array_values() renumbers the fields from 0, so the position follows iteration order whatever the input keys are.
    foreach (array_values($fields) as $position => $field) {
        $target = sprintf('%s_page%d-%d.%s', $outputName, $pageIndex, $position, $format);
        $image = $this->extractImage($field, $pageIndex, $position, $target, $format);

        if (null === $image) {
            continue;
        }

        $results[] = $image;
    }

    return $results;
}

/**
* Extracts an image from a set of coordinates.
*
Expand Down
57 changes: 7 additions & 50 deletions src/Pdf/PdfExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@
use Mindee\Dependency\DependencyChecker;
use Mindee\Error\MindeePdfException;
use Mindee\Input\LocalInputSource;
use Mindee\V1\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroups;
use setasign\Fpdi\Fpdi;
use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException;
use setasign\Fpdi\PdfParser\Filter\FilterException;
use setasign\Fpdi\PdfParser\PdfParserException;
use setasign\Fpdi\PdfReader\PdfReaderException;

use function count;
use function is_array;
use function sprintf;

/**
Expand All @@ -39,8 +37,8 @@ class PdfExtractor
/**
* @param LocalInputSource $localInput Local Input, accepts all compatible formats.
*
* @throws MindeePdfException Throws if PDF operations aren't supported, or if the file
* can't be read, respectively.
* @throws MindeePdfException|ImagickException Throws if PDF operations aren't supported, or if the file
* can't be read, respectively.
*/
public function __construct(LocalInputSource $localInput)
{
Expand Down Expand Up @@ -86,14 +84,14 @@ public function getPageCount(): int
/**
* Extracts sub-documents from the source document using list of page indexes.
*
* @param array<array<integer>>|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep.
* @param array<array<integer>> $pageIndexes List of sub-lists of pages to keep.
*
* @return ExtractedPdf[] list of extracted documents
*
* @throws MindeePdfException Throws if FPDF/FPDI wasn't able to handle the pdf during the extraction.
* @throws InvalidArgumentException Throws if invalid indexes are provided.
*/
public function extractSubDocuments(array|InvoiceSplitterV1InvoicePageGroups $pageIndexes): array
public function extractSubDocuments(array $pageIndexes): array
{
$extractedPdfs = [];

Expand Down Expand Up @@ -141,58 +139,17 @@ public function extractSubDocuments(array|InvoiceSplitterV1InvoicePageGroups $pa
/**
* Extracts invoices as complete PDFs from the document.
*
* @param array<array<integer>>|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep.
* @param array<array<integer>> $pageIndexes List of sub-lists of pages to keep.
* @param boolean $strict Whether to trust confidence scores or not.
*
* @return ExtractedPdf[] a list of extracted invoices
*/
public function extractInvoices(array|InvoiceSplitterV1InvoicePageGroups $pageIndexes, bool $strict = false): array
public function extractInvoices(array $pageIndexes, bool $strict = false): array
{
if (empty($pageIndexes)) {
return [];
}
if (!$strict) {
$indexes = array_map(static fn($invoicePageIndexes) => $invoicePageIndexes->pageIndexes, (array) $pageIndexes);

return $this->extractSubDocuments($indexes);
}
if (is_array($pageIndexes[0])) {
return $this->extractSubDocuments($pageIndexes);
}

$correctPageIndexes = [];
$currentList = [];
$previousConfidence = null;

$i = 0;
foreach ($pageIndexes as $pageIndex) {
$confidence = $pageIndex->confidence;
$pageList = $pageIndex->pageIndexes;

if ($confidence >= 0.5 && null === $previousConfidence) {
$currentList = $pageList;
} elseif ($confidence >= 0.5 && $i !== count($pageIndexes) - 1) {
if (!empty($currentList)) {
$correctPageIndexes[] = $currentList;
}
$currentList = $pageList;
} elseif ($confidence < 0.5 && $i === count($pageIndexes) - 1) {
$currentList = array_merge($currentList, $pageList);
if (!empty($currentList)) {
$correctPageIndexes[] = $currentList;
}
} else {
if (!empty($currentList)) {
$correctPageIndexes[] = $currentList;
}
$correctPageIndexes[] = $pageList;
}

$previousConfidence = $confidence;
++$i;
}

return $this->extractSubDocuments($correctPageIndexes);
return $this->extractSubDocuments($pageIndexes);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions src/V1/Client.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
use Mindee\Error\ErrorCode;
use Mindee\Error\MindeeApiException;
use Mindee\Error\MindeeException;
use Mindee\Error\V1\MindeeV1ClientException;
use Mindee\Error\V1\MindeeV1HttpException;
use Mindee\Input\InputSource;
use Mindee\Input\LocalInputSource;
use Mindee\Input\LocalResponse;
use Mindee\Input\PageOptions;
use Mindee\V1\ClientOptions\PredictMethodOptions;
use Mindee\V1\ClientOptions\WorkflowOptions;
use Mindee\V1\Error\MindeeV1ClientException;
use Mindee\V1\Error\MindeeV1HttpException;
use Mindee\V1\Http\Endpoint;
use Mindee\V1\Http\MindeeApi;
use Mindee\V1\Http\MindeeWorkflowApi;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* Mindee Client Exceptions.
*/

namespace Mindee\Error\V1;
namespace Mindee\V1\Error;

use Mindee\Error\MindeeException;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* Mindee HTTP Exceptions.
*/

namespace Mindee\Error\V1;
namespace Mindee\V1\Error;

use Mindee\Error\ErrorCode;
use Mindee\Error\MindeeException;
Expand Down
95 changes: 94 additions & 1 deletion src/V1/Image/ImageExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,102 @@

namespace Mindee\V1\Image;

use Mindee\Error\ErrorCode;
use Mindee\Error\MindeeGeometryException;
use Mindee\Geometry\Polygon;
use Mindee\Image\ExtractedImage;
use Mindee\Image\ImageExtractor as BaseImageExtractor;
use Mindee\V1\Parsing\Standard\BaseField;

use function sprintf;

/**
* Wrapper class for V1 of the BaseImageExtractor.
*/
class ImageExtractor extends BaseImageExtractor {}
class ImageExtractor extends BaseImageExtractor
{
    /**
     * Extracts one image per field from a single page.
     *
     * @param array<BaseField<string|float|integer|boolean|Polygon>> $fields     Fields carrying position data.
     * @param integer                                                $pageIndex  Zero-based index of the page to extract from.
     * @param null|string                                            $outputName Base name for the generated files; the
     *                                                                           extractor's own filename is used when null.
     *
     * @return array<ExtractedImage> The extracted images.
     */
    public function extractImagesFromPage(array $fields, int $pageIndex, ?string $outputName = null): array
    {
        return $this->extractFromPage($fields, $pageIndex, $outputName ?? $this->filename);
    }

    /**
     * Extracts the image located by a field's position data.
     *
     * The position attributes are checked in this order: polygon, boundingBox, quadrangle, rectangle.
     * The first non-empty one is handed to extractPolygonFromPage().
     *
     * @param BaseField<string|float|integer|boolean|Polygon> $field     The field to extract.
     * @param integer                                         $pageIndex Zero-based index of the page to extract from.
     * @param integer                                         $index     Index used to name the extracted image.
     * @param string                                          $filename  The output filename.
     * @param string                                          $format    The output format.
     *
     * @return null|ExtractedImage The result of extractPolygonFromPage() for the selected position data.
     *
     * @throws MindeeGeometryException Throws if none of the position attributes holds usable data.
     */
    public function extractImage(
        BaseField $field,
        int $pageIndex,
        int $index,
        string $filename,
        string $format
    ): ?ExtractedImage {
        $polygon = null;

        foreach (['polygon', 'boundingBox', 'quadrangle', 'rectangle'] as $attribute) {
            if (!empty($field->{$attribute})) {
                $polygon = $field->{$attribute};
                break;
            }
        }

        if (null !== $polygon) {
            return $this->extractPolygonFromPage($polygon, $pageIndex, $index, $filename, $format);
        }

        throw new MindeeGeometryException(
            'Provided field has no valid position data.',
            ErrorCode::GEOMETRIC_OPERATION_FAILED
        );
    }

    /**
     * Runs extractImage() on every field and collects the non-null results.
     *
     * Each output file is named "<outputName>_page<pageIndex>-<position>.<format>", where the position
     * is the field's zero-based rank in iteration order.
     *
     * @param array<BaseField<string|float|integer|boolean|Polygon>> $fields     Fields to extract.
     * @param integer                                                $pageIndex  Zero-based index of the page to extract from.
     * @param string                                                 $outputName Base name of the created files.
     * @param string                                                 $format     The output format.
     *
     * @return array<ExtractedImage> The created images.
     */
    protected function extractFromPage(array $fields, int $pageIndex, string $outputName, string $format = 'jpg'): array
    {
        $results = [];

        // array_values() renumbers the fields from 0, so the position follows iteration order whatever the input keys are.
        foreach (array_values($fields) as $position => $field) {
            $target = sprintf('%s_page%d-%d.%s', $outputName, $pageIndex, $position, $format);
            $image = $this->extractImage($field, $pageIndex, $position, $target, $format);

            if (null === $image) {
                continue;
            }

            $results[] = $image;
        }

        return $results;
    }
}
Loading
Loading