Sample codes for parser contracts
- Updated: 2023/10/16
Sample codes for parser contracts
As a CoE Lead, you must provide the input and output contracts for the parser package to integrate it with Document Automation.
The following are sample codes for the input and output contracts:
Input contract
package aws;
import com.automationanywhere.botcommand.data.impl.StringValue;
import com.automationanywhere.commandsdk.annotations.*;
import com.automationanywhere.commandsdk.annotations.rules.LocalFile;
import com.automationanywhere.commandsdk.annotations.rules.NotEmpty;
import com.automationanywhere.commandsdk.model.AttributeType;
import com.automationanywhere.core.security.SecureString;
import static com.automationanywhere.commandsdk.model.DataType.STRING;
/**
 * Sample input contract for a parser package command.
 *
 * <p>The command accepts a local file path and a credential and returns a string value.
 * This sample is a skeleton: it reads the credential and returns an empty string.
 */
@BotCommand
@CommandPkg(
    name = "ExtractionCommand",
    label = "Extraction Command",
    description = "Extraction Command",
    node_label = "Extraction Command",
    return_type = STRING,
    return_label = "Extraction Command Response",
    minimum_botagent_version = "21.98",
    minimum_controlroom_version = "10520")
public class ExtractionCommand {

    /**
     * Entry point of the command.
     *
     * @param inputFilePath file attribute labelled "Image File Path"; declared as a
     *     non-empty local file through its annotations
     * @param serviceAccount credential attribute labelled "Service Account"
     * @return a {@code StringValue} wrapping an empty string in this sample
     */
    @Execute
    public StringValue compute(
            @Idx(index = "1", type = AttributeType.FILE)
            @LocalFile
            @Pkg(label = "Image File Path")
            @NotEmpty
            final String inputFilePath,
            @Idx(index = "2", type = AttributeType.CREDENTIAL)
            @Pkg(label = "Service Account")
            final SecureString serviceAccount) {
        // The credential is read but not used further in this skeleton.
        // NOTE(review): presumably this yields the plain-text credential - confirm against the SDK.
        String plainServiceAccount = serviceAccount.getInsecureString();

        // Placeholder response: the sample returns an empty string.
        return new StringValue("");
    }
}
Output contract
The response from the command must be in JSON format and must follow a schema that
Document Automation can work with. The following is the schema of the JSON
output generated by the command:
{
"metadata": {
"documentId": "unique Id used to identify document in DA",
"filepath": "Input File Path that refers to the Path used by Bot runner during execution",
"executionStatus": {
"statusCode": "Status code which indicates the response post extraction",
"statusMessage": "status message which indicates the response post extraction ",
"message": "Message that details the result"
},
"timeInMs": "Time taken in ms to process the entire document",
"clusterId": "unique Id used to capture Heuristic feedback",
"numberOfPages": "Number of pages in the document"
},
"imagePreprocessingResult": {
"metadata": {
"documentId": "unique Id used to identify document in DA",
"filepath": "Input File Path that refers to the Path used by Bot runner during execution",
"executionStatus": {
"statusCode": "Status code which indicates the response post extraction",
"statusMessage": "status message which indicates the response post extraction ",
"message": "Message that details the result"
},
"timeInMs": "Time taken in ms to process the entire document"
},
"pages": [
{
"filepath": "File Path that refers to the Page in the document post splitting the document into pages",
"deskew": true,
"orientation": true,
"renderDpi": 0,
"width": "width of the Page",
"height": "height of the Page"
}
]
},
"ocrResult": {
"metadata": {
"executionStatus": {
"statusCode": "Status code which indicates the response post extraction",
"statusMessage": "status message which indicates the response post extraction ",
"message": "Message that details the result"
},
"numberOfPages": "Number of pages in the document",
"learningInstanceSetting": {
"provider": "Name of the provider",
"version": "Provider Version",
"langCodes": [
"Language code in DA"
]
},
"timeInMs": "time taken in ms to get the OCR result for the document",
"pages": [
{
"id": "unique Id to identify the page in the document",
"pageNum": "Page number in the document",
"filepath": "File Path that refers to the Page in the document post splitting the document into pages",
"langCode": "Code that refers to the language in DA"
}
]
},
"blocks": [
{
"id": "Block Id of the word segment",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"text": "text from the segment",
"blockType": "Type of the Block. Possible options: WORD/LINE/INFO_BLOCK/KEY_VAL_BLOCK/TABLE/TABLE_HEADER/TABLE_HEADER_ELEM/COLUMN/KEY_INFO_BLOCK/NO_OBJECT",
"pageNum": "Page number in the document",
"confidence": "OCR Confidence"
}
]
},
"docDetectResult": {
"metadata": {
"executionStatus": {
"statusCode": "Status code which indicates the response post extraction",
"statusMessage": "status message which indicates the response post extraction ",
"message": "Message that details the result"
},
"timeInMs": "Time taken in ms to process the entire document"
},
"featureObjects": [
{
"id": "Feature object UUID",
"blockType": "INFO_BLOCK",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"text": "Feature object Text",
"confidence": "confidence",
"ocrConfidence": "OCR Confidence ",
"pageNum": "Page number in the document"
}
]
},
"extractionResult": {
"metadata": {
"filepath": "Input File Path that refers to the Path used by Bot runner during execution",
"executionStatus": {
"statusCode": "Status code which indicates the response post extraction",
"statusMessage": "status message which indicates the response post extraction ",
"message": "Message that details the result"
},
"timeInMs": "Time taken in ms to process the entire document",
"pages": [
{
"id": "unique Id to identify the page in the document",
"pageNum": "Page number in the document",
"filepath": "File Path that refers to the Page in the document post splitting the document into pages",
"width": "width of the Page",
"height": "height of the Page",
"langCode": "Code that refers to the language in DA"
}
]
},
"keyValueFeatures": [
{
"id": "unique Id to identify the keyValue found in the document",
"domainFieldKey": "field that needs to be extracted",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"text": "text from the segment",
"pageNum": "Page number in the document",
"ocrConfidence": 0.909,
"extractionScore": "Extraction score",
"key": {
"id": "unique Id to identify the key found in the document",
"text": "text from the segment",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"extractionScore": "Extraction score"
},
"value": {
"id": "unique Id to identify the value found in the document",
"text": "text from the segment",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"extractionScore": "Extraction score"
},
"extractedDataType": "Data type of the extracted field"
}
],
"tableFeatures": [
{
"id": "unique Id to identify the table in the document",
"headers": [
{
"id": "unique Id to identify the header column in the document",
"domainFieldKey": "header field that needs to be extracted",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"text": "text from the segment",
"pageNum": "Page number in the document",
"ocrConfidence": "OCR Confidence",
"extractionScore": "Extraction score"
}
],
"rows": [
{
"id": "unique Id to identify the row in the table",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"cells": [
{
"id": "unique Id to identify the cell in the row",
"geometry": {
"x1": "x1 axis of the bounding box rectangle",
"y1": "y1 axis of the bounding box rectangle",
"x2": "x2 axis of the bounding box rectangle",
"y2": "y2 axis of the bounding box rectangle"
},
"domainFieldKey": "column field that needs to be extracted",
"text": "text from the segment",
"pageNum": "Page number in the document",
"extractedDataType": "TEXT",
"ocrConfidence": "OCR Confidence",
"extractionScore": "Extraction score"
}
],
"pageNum": 1
}
]
}
]
}
}