Skip to content

Document converter

This is an automatically generated API reference of the main components of Docling.

document_converter

Classes:

DocumentConverter

DocumentConverter(
    allowed_formats: Optional[List[InputFormat]] = None,
    format_options: Optional[
        Dict[InputFormat, FormatOption]
    ] = None,
)

Methods:

Attributes:

allowed_formats instance-attribute

allowed_formats = (
    allowed_formats
    if allowed_formats is not None
    else [e for e in InputFormat]
)

format_to_options instance-attribute

format_to_options = {
    format: (
        _get_default_option(format=format)
        if (custom_option := get(format)) is None
        else custom_option
    )
    for format in allowed_formats
}

initialized_pipelines instance-attribute

initialized_pipelines: Dict[
    Type[BasePipeline], BasePipeline
] = {}

convert

convert(
    source: Union[Path, str, DocumentStream],
    headers: Optional[Dict[str, str]] = None,
    raises_on_error: bool = True,
    max_num_pages: int = maxsize,
    max_file_size: int = maxsize,
) -> ConversionResult

convert_all

convert_all(
    source: Iterable[Union[Path, str, DocumentStream]],
    headers: Optional[Dict[str, str]] = None,
    raises_on_error: bool = True,
    max_num_pages: int = maxsize,
    max_file_size: int = maxsize,
) -> Iterator[ConversionResult]

initialize_pipeline

initialize_pipeline(format: InputFormat)

Initialize the conversion pipeline for the selected format.

ConversionResult

Bases: BaseModel

Attributes:

assembled class-attribute instance-attribute

assembled: AssembledUnit = AssembledUnit()

document class-attribute instance-attribute

document: DoclingDocument = _EMPTY_DOCLING_DOC

errors class-attribute instance-attribute

errors: List[ErrorItem] = []

input instance-attribute

input: InputDocument

legacy_document property

legacy_document

pages class-attribute instance-attribute

pages: List[Page] = []

status class-attribute instance-attribute

status: ConversionStatus = PENDING

timings class-attribute instance-attribute

timings: Dict[str, ProfilingItem] = {}

ConversionStatus

Bases: str, Enum

Attributes:

FAILURE class-attribute instance-attribute

FAILURE = 'failure'

PARTIAL_SUCCESS class-attribute instance-attribute

PARTIAL_SUCCESS = 'partial_success'

PENDING class-attribute instance-attribute

PENDING = 'pending'

SKIPPED class-attribute instance-attribute

SKIPPED = 'skipped'

STARTED class-attribute instance-attribute

STARTED = 'started'

SUCCESS class-attribute instance-attribute

SUCCESS = 'success'

FormatOption

Bases: BaseModel

Methods:

Attributes:

backend instance-attribute

backend: Type[AbstractDocumentBackend]

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls instance-attribute

pipeline_cls: Type[BasePipeline]

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

InputFormat

Bases: str, Enum

A document format supported by document backend parsers.

Attributes:

ASCIIDOC class-attribute instance-attribute

ASCIIDOC = 'asciidoc'

DOCX class-attribute instance-attribute

DOCX = 'docx'

HTML class-attribute instance-attribute

HTML = 'html'

IMAGE class-attribute instance-attribute

IMAGE = 'image'

MD class-attribute instance-attribute

MD = 'md'

PDF class-attribute instance-attribute

PDF = 'pdf'

PPTX class-attribute instance-attribute

PPTX = 'pptx'

XLSX class-attribute instance-attribute

XLSX = 'xlsx'

XML_PUBMED class-attribute instance-attribute

XML_PUBMED = 'xml_pubmed'

XML_USPTO class-attribute instance-attribute

XML_USPTO = 'xml_uspto'

PdfFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = (
    DoclingParseV2DocumentBackend
)

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = StandardPdfPipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

ImageFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = (
    DoclingParseV2DocumentBackend
)

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = StandardPdfPipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

StandardPdfPipeline

StandardPdfPipeline(pipeline_options: PdfPipelineOptions)

Bases: PaginatedPipeline

Methods:

Attributes:

artifacts_path instance-attribute

artifacts_path = download_models_hf()

build_pipe instance-attribute

build_pipe = [
    PagePreprocessingModel(
        options=PagePreprocessingOptions(
            images_scale=images_scale
        )
    ),
    ocr_model,
    LayoutModel(
        artifacts_path=artifacts_path / _layout_model_path,
        accelerator_options=accelerator_options,
    ),
    TableStructureModel(
        enabled=do_table_structure,
        artifacts_path=artifacts_path / _table_model_path,
        options=table_structure_options,
        accelerator_options=accelerator_options,
    ),
    PageAssembleModel(
        options=PageAssembleOptions(keep_images=keep_images)
    ),
]

enrichment_pipe instance-attribute

enrichment_pipe = []

glm_model instance-attribute

glm_model = GlmModel(options=GlmOptions())

pipeline_options instance-attribute

pipeline_options: PdfPipelineOptions

download_models_hf staticmethod

download_models_hf(
    local_dir: Optional[Path] = None, force: bool = False
) -> Path

execute

execute(
    in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult

get_default_options classmethod

get_default_options() -> PdfPipelineOptions

get_ocr_model

get_ocr_model() -> Optional[BaseOcrModel]

initialize_page

initialize_page(
    conv_res: ConversionResult, page: Page
) -> Page

is_backend_supported classmethod

is_backend_supported(backend: AbstractDocumentBackend)

WordFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = (
    MsWordDocumentBackend
)

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

PowerpointFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = (
    MsPowerpointDocumentBackend
)

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

MarkdownFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = (
    MarkdownDocumentBackend
)

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

AsciiDocFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = AsciiDocBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

HTMLFormatOption

Bases: FormatOption

Methods:

Attributes:

backend class-attribute instance-attribute

backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

pipeline_cls class-attribute instance-attribute

pipeline_cls: Type = SimplePipeline

pipeline_options class-attribute instance-attribute

pipeline_options: Optional[PipelineOptions] = None

set_optional_field_default

set_optional_field_default() -> FormatOption

SimplePipeline

SimplePipeline(pipeline_options: PipelineOptions)

Bases: BasePipeline

SimplePipeline.

This class is currently used for formats and backends that produce a DoclingDocument directly.

Methods:

Attributes:

build_pipe instance-attribute

build_pipe: List[Callable] = []

enrichment_pipe instance-attribute

enrichment_pipe: List[BaseEnrichmentModel] = []

pipeline_options instance-attribute

pipeline_options = pipeline_options

execute

execute(
    in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult

get_default_options classmethod

get_default_options() -> PipelineOptions

is_backend_supported classmethod

is_backend_supported(backend: AbstractDocumentBackend)