Python API Reference

Complete API documentation for the DDEX Parser Python bindings.

Installation

pip install ddex-parser

Imports

from ddex_parser import DdexParser, ParseOptions, ParseResult, parse

Classes

DdexParser

Main parser class for Python with pandas integration support.

class DdexParser:
    def __init__(self) -> None: ...
    def parse(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult: ...
    async def parse_async(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult: ...
    def stream(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> Iterator[Dict[str, Any]]: ...
    def to_dataframe(self, xml: Union[str, bytes], schema: str = 'flat') -> 'pd.DataFrame': ...
    def detect_version(self, xml: Union[str, bytes]) -> str: ...
    def sanity_check(self, xml: Union[str, bytes]) -> Dict[str, Any]: ...

Constructor

parser = DdexParser()

Creates a new DDEX parser instance with Rust backend.

parse()

def parse(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult

Synchronously parses DDEX XML content.

Parameters:

xml: Union[str, bytes] - DDEX XML content as string or bytes
options: Optional[ParseOptions] - Parsing configuration options

Returns: ParseResult - Parsed DDEX message structure

Example:

from ddex_parser import DdexParser, ParseOptions

parser = DdexParser()
options = ParseOptions(
    include_raw_extensions=True,
    validate_references=True
)

with open('release.xml', 'r') as f:
    xml_content = f.read()

result = parser.parse(xml_content, options)
print(f"Parsed {result.release_count} releases")
print(f"Message ID: {result.message_id}")

parse_async()

async def parse_async(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult

Asynchronously parses DDEX XML content.

Parameters:

xml: Union[str, bytes] - DDEX XML content
options: Optional[ParseOptions] - Parsing configuration options

Returns: ParseResult - Parsed DDEX message structure

Example:

import asyncio
from ddex_parser import DdexParser

async def parse_multiple_files(file_paths):
    parser = DdexParser()
    tasks = []
    
    for file_path in file_paths:
        with open(file_path, 'r') as f:
            xml_content = f.read()
        tasks.append(parser.parse_async(xml_content))
    
    results = await asyncio.gather(*tasks)
    return results

# Usage
results = asyncio.run(parse_multiple_files(['file1.xml', 'file2.xml']))

stream()

def stream(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> Iterator[Dict[str, Any]]

Creates an iterator for streaming through releases in large DDEX files.

Parameters:

xml: Union[str, bytes] - DDEX XML content
options: Optional[ParseOptions] - Parsing configuration options

Returns: Iterator[Dict[str, Any]] - Iterator yielding release dictionaries

Example:

parser = DdexParser()
with open('large_catalog.xml', 'r') as f:
    xml_content = f.read()

for release in parser.stream(xml_content):
    print(f"Processing: {release['title']}")
    print(f"Artist: {release['artist']}")
    print(f"Release ID: {release['release_id']}")

to_dataframe()

def to_dataframe(self, xml: Union[str, bytes], schema: str = 'flat') -> 'pd.DataFrame'

Converts DDEX XML directly to a pandas DataFrame for analysis.

Parameters:

xml: Union[str, bytes] - DDEX XML content
schema: str - Output schema: 'flat' (default) or 'graph'

Returns: pd.DataFrame - Structured DataFrame with DDEX data

Raises: ImportError - If pandas is not installed

Example:

import pandas as pd
from ddex_parser import DdexParser

parser = DdexParser()

# Load XML file
with open('catalog.xml', 'r') as f:
    xml_content = f.read()

# Convert to DataFrame
df = parser.to_dataframe(xml_content, schema='flat')

# Analyze the data
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Basic statistics
print(f"Total releases: {df['release_id'].nunique()}")
print(f"Total tracks: {df['sound_recording_id'].nunique()}")
print(f"Unique artists: {df['display_artist'].nunique()}")

# Top genres
genre_counts = df['genre'].value_counts()
print(f"Top genres: {genre_counts.head()}")

detect_version()

def detect_version(self, xml: Union[str, bytes]) -> str

Detects the DDEX version from XML content.

Parameters:

xml: Union[str, bytes] - DDEX XML content

Returns: str - Detected version (e.g., "4.3", "4.2", "3.8.2", "Unknown")

Example:

parser = DdexParser()
with open('unknown_version.xml', 'r') as f:
    xml_content = f.read()

version = parser.detect_version(xml_content)
print(f"Detected DDEX version: {version}")

if version == "Unknown":
    print("Warning: Could not detect DDEX version")

sanity_check()

def sanity_check(self, xml: Union[str, bytes]) -> Dict[str, Any]

Performs a quick validation check on DDEX XML without full parsing.

Parameters:

xml: Union[str, bytes] - DDEX XML content

Returns: Dict[str, Any] - Validation results with keys:

is_valid: bool - Whether the XML passes basic validation
version: str - Detected DDEX version
errors: List[str] - List of validation errors
warnings: List[str] - List of validation warnings

Example:

parser = DdexParser()
with open('suspicious_file.xml', 'r') as f:
    xml_content = f.read()

check = parser.sanity_check(xml_content)

if check['is_valid']:
    print(f"✓ Valid DDEX {check['version']} file")
else:
    print("✗ Invalid DDEX file")
    for error in check['errors']:
        print(f"  Error: {error}")
    
if check['warnings']:
    print("Warnings:")
    for warning in check['warnings']:
        print(f"  Warning: {warning}")

ParseOptions

Configuration options for parsing behavior.

class ParseOptions:
    def __init__(
        self,
        include_raw_extensions: bool = False,
        include_comments: bool = False,
        validate_references: bool = True,
        streaming: bool = False,
        timeout: float = 30.0,
    ) -> None: ...
    
    def to_dict(self) -> Dict[str, Any]: ...

Constructor Parameters

include_raw_extensions: bool (default: False)
Include raw XML for extension elements to preserve round-trip fidelity
include_comments: bool (default: False)
Include XML comments in the parsed output
validate_references: bool (default: True)
Validate that all resource references are resolvable
streaming: bool (default: False)
Enable streaming mode for large files
timeout: float (default: 30.0)
Parsing timeout in seconds

Example:

from ddex_parser import ParseOptions

# Default options
options = ParseOptions()

# Custom options for round-trip processing
roundtrip_options = ParseOptions(
    include_raw_extensions=True,
    include_comments=True,
    validate_references=True
)

# Options for large file processing
streaming_options = ParseOptions(
    streaming=True,
    validate_references=False,
    timeout=120.0
)

to_dict()

def to_dict(self) -> Dict[str, Any]

Converts options to dictionary format for internal use.

Returns: Dict[str, Any] - Dictionary representation of options

ParseResult

Result object containing parsed DDEX message data.

class ParseResult:
    def __init__(self, data: Dict[str, Any]) -> None: ...
    
    message_id: str
    version: str
    release_count: int
    releases: List[Dict[str, Any]]

Properties

message_id: str - Unique message identifier from DDEX XML
version: str - Detected DDEX version
release_count: int - Number of releases in the message
releases: List[Dict[str, Any]] - List of release data dictionaries

Example:

result = parser.parse(xml_content)

print(f"Message ID: {result.message_id}")
print(f"DDEX Version: {result.version}")
print(f"Release Count: {result.release_count}")

# Access individual releases
for release in result.releases:
    print(f"Release: {release.get('title', 'Unknown')}")
    print(f"Artist: {release.get('artist', 'Unknown')}")

Convenience Functions

parse()

def parse(xml: Union[str, bytes], **kwargs) -> ParseResult

Convenience function for quick parsing without creating a parser instance.

Parameters:

xml: Union[str, bytes] - DDEX XML content
**kwargs - Keyword arguments passed to ParseOptions

Returns: ParseResult - Parsed DDEX message structure

Example:

from ddex_parser import parse

# Quick parse with default options
result = parse(xml_content)

# Quick parse with custom options
result = parse(
    xml_content,
    include_raw_extensions=True,
    validate_references=False
)

DataFrame Integration

Schema Types

When using to_dataframe(), you can specify different output schemas:

Flat Schema (Default)

The flat schema provides a denormalized, analysis-friendly structure:

df = parser.to_dataframe(xml_content, schema='flat')

Common Columns:

message_id - Message identifier
release_id - Release identifier
sound_recording_id - Track identifier
isrc - International Standard Recording Code
title - Track or release title
display_artist - Primary artist name
label_name - Record label
release_date - Release date (ISO format)
genre - Musical genre
territory - Geographic territory
deal_type - Commercial deal type
distribution_channel - Distribution method

Graph Schema

The graph schema preserves the hierarchical DDEX structure:

df = parser.to_dataframe(xml_content, schema='graph')

This schema maintains the original XML hierarchy and is useful for round-trip processing.

DataFrame Examples

import pandas as pd
from ddex_parser import DdexParser

parser = DdexParser()

# Load and convert to DataFrame
with open('catalog.xml', 'r') as f:
    xml_content = f.read()

df = parser.to_dataframe(xml_content)

# Basic analysis
print(f"Total releases: {df['release_id'].nunique()}")
print(f"Total tracks: {df['sound_recording_id'].nunique()}")

# Genre analysis
genre_dist = df['genre'].value_counts()
print("Top genres:")
print(genre_dist.head())

# Artist analysis
artist_track_counts = df.groupby('display_artist')['sound_recording_id'].nunique()
top_artists = artist_track_counts.sort_values(ascending=False).head(10)
print("Most prolific artists:")
print(top_artists)

# Territory analysis
territory_releases = df.groupby('territory')['release_id'].nunique()
print("Releases by territory:")
print(territory_releases)

# Date analysis
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
yearly_releases = df.groupby('release_year')['release_id'].nunique()
print("Releases by year:")
print(yearly_releases)

Error Handling

The Python API raises standard Python exceptions for various error conditions:

from ddex_parser import DdexParser

parser = DdexParser()

try:
    result = parser.parse(xml_content)
except ValueError as e:
    if "Invalid XML" in str(e):
        print(f"XML parsing failed: {e}")
    elif "Unsupported version" in str(e):
        print(f"Unsupported DDEX version: {e}")
    else:
        print(f"Validation error: {e}")
except TimeoutError as e:
    print(f"Parsing timeout: {e}")
except MemoryError as e:
    print(f"Out of memory: {e}")
except ImportError as e:
    print(f"Missing dependency: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Common Exception Types

ValueError: Invalid XML, schema violations, or unsupported DDEX versions
TimeoutError: Parsing exceeded specified timeout
MemoryError: File too large for available memory
ImportError: Missing optional dependencies (e.g., pandas for to_dataframe())
FileNotFoundError: File path issues when reading from disk

Performance Optimization

Memory Management

# Use streaming for large files
parser = DdexParser()
options = ParseOptions(streaming=True, timeout=300.0)

# Process without reference validation for speed
fast_options = ParseOptions(
    validate_references=False,
    include_raw_extensions=False
)

Async Processing

import asyncio
from ddex_parser import DdexParser

async def process_catalog_files(file_paths):
    parser = DdexParser()
    semaphore = asyncio.Semaphore(5)  # Limit concurrent operations
    
    async def process_file(file_path):
        async with semaphore:
            with open(file_path, 'r') as f:
                xml_content = f.read()
            return await parser.parse_async(xml_content)
    
    tasks = [process_file(path) for path in file_paths]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # Handle results and exceptions
    successful = [r for r in results if isinstance(r, ParseResult)]
    errors = [r for r in results if isinstance(r, Exception)]
    
    return successful, errors

Batch DataFrame Processing

import pandas as pd
from pathlib import Path
from ddex_parser import DdexParser

def process_catalog_to_dataframe(directory_path):
    parser = DdexParser()
    dataframes = []
    
    xml_files = Path(directory_path).glob("*.xml")
    
    for xml_file in xml_files:
        try:
            with open(xml_file, 'r') as f:
                xml_content = f.read()
            
            df = parser.to_dataframe(xml_content)
            df['source_file'] = xml_file.name
            dataframes.append(df)
            
        except Exception as e:
            print(f"Failed to process {xml_file}: {e}")
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

# Usage
catalog_df = process_catalog_to_dataframe('./ddex_files/')
print(f"Combined catalog: {len(catalog_df)} records from {catalog_df['source_file'].nunique()} files")

CLI Integration

The Python package includes a command-line interface:

# Parse a single file
python -m ddex_parser parse input.xml

# Convert to JSON
python -m ddex_parser parse input.xml --output output.json

# Convert to CSV (requires pandas)
python -m ddex_parser to-csv input.xml output.csv

# Validate DDEX file
python -m ddex_parser validate input.xml

# Detect version
python -m ddex_parser version input.xml

Access the CLI programmatically:

from ddex_parser.cli import main
import sys

# Simulate command line arguments
sys.argv = ['ddex_parser', 'parse', 'input.xml', '--output', 'output.json']
main()

Installation​

Imports​

Classes​

DdexParser​

Constructor​

parse()​

parse_async()​

stream()​

to_dataframe()​

detect_version()​

sanity_check()​

ParseOptions​

Constructor Parameters​

to_dict()​

ParseResult​

Properties​

Convenience Functions​

parse()​

DataFrame Integration​

Schema Types​

Flat Schema (Default)​

Graph Schema​

DataFrame Examples​

Error Handling​

Common Exception Types​

Performance Optimization​

Memory Management​

Async Processing​

Batch DataFrame Processing​

CLI Integration​

Installation

Imports

Classes

DdexParser

Constructor

parse()

parse_async()

stream()

to_dataframe()

detect_version()

sanity_check()

ParseOptions

Constructor Parameters

to_dict()

ParseResult

Properties

Convenience Functions

parse()

DataFrame Integration

Schema Types

Flat Schema (Default)

Graph Schema

DataFrame Examples

Error Handling

Common Exception Types

Performance Optimization

Memory Management

Async Processing

Batch DataFrame Processing

CLI Integration