AI Analysis: clean extracted text to prevent JSON encoding errors
This commit is contained in:
@@ -50,13 +50,28 @@ class AIAnalysisService
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
$pdf = $this->parser->parseFile(Storage::disk('local')->path($document->file_path));
|
$pdf = $this->parser->parseFile(Storage::disk('local')->path($document->file_path));
|
||||||
return $pdf->getText();
|
$text = $pdf->getText();
|
||||||
|
return $this->cleanText($text);
|
||||||
} catch (\Exception $e) {
|
} catch (\Exception $e) {
|
||||||
Log::error("PDF Extraction Error: " . $e->getMessage());
|
Log::error("PDF Extraction Error: " . $e->getMessage());
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalise extracted text so it is valid UTF-8 and safe to JSON-encode.
 *
 * Malformed byte sequences are replaced by mbstring's substitute character,
 * and control characters (C0, DEL and C1) are removed, except for tab,
 * line feed and carriage return. All other Unicode text (accented letters,
 * typographic punctuation, non-Latin scripts, emoji) is preserved.
 *
 * @param string $text Raw text as returned by the PDF parser.
 *
 * @return string The cleaned text with leading/trailing whitespace trimmed.
 */
protected function cleanText(string $text): string
{
    // Re-encoding UTF-8 to UTF-8 swaps every invalid byte sequence for the
    // mbstring substitute character, so the result is always well-formed.
    $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');

    // Strip control characters only. A deny-list is used on purpose: an
    // allow-list limited to Latin-1 would silently delete every code point
    // above U+00FF (curly quotes, dashes, CJK, Cyrillic, ...).
    // Kept: \x09 (tab), \x0A (LF), \x0D (CR).
    $stripped = preg_replace(
        '/[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}\x{7F}-\x{9F}]/u',
        '',
        $text
    );

    // preg_replace() returns null on a PCRE error; fall back to the
    // re-encoded text instead of passing null on to trim().
    if ($stripped === null) {
        $stripped = $text;
    }

    return trim($stripped);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Call the AI API (using a placeholder for now, or direct Http call).
|
* Call the AI API (using a placeholder for now, or direct Http call).
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user