Python API Reference
Complete API documentation for the DDEX Parser Python bindings.
Installation
pip install ddex-parser
Imports
from ddex_parser import DDEXParser, ParseOptions, ParseResult, parse
Classes
DDEXParser
Main parser class for Python with pandas integration support.
class DDEXParser:
def __init__(self) -> None: ...
def parse(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult: ...
async def parse_async(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult: ...
def stream(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> Iterator[Dict[str, Any]]: ...
def to_dataframe(self, xml: Union[str, bytes], schema: str = 'flat') -> 'pd.DataFrame': ...
def detect_version(self, xml: Union[str, bytes]) -> str: ...
def sanity_check(self, xml: Union[str, bytes]) -> Dict[str, Any]: ...
Constructor
parser = DDEXParser()
Creates a new DDEX parser instance with Rust backend.
parse()
def parse(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult
Synchronously parses DDEX XML content.
Parameters:
xml: Union[str, bytes]
- DDEX XML content as string or bytes
options: Optional[ParseOptions]
- Parsing configuration options
Returns: ParseResult
- Parsed DDEX message structure
Example:
from ddex_parser import DDEXParser, ParseOptions
parser = DDEXParser()
options = ParseOptions(
include_raw_extensions=True,
validate_references=True
)
with open('release.xml', 'r') as f:
xml_content = f.read()
result = parser.parse(xml_content, options)
print(f"Parsed {result.release_count} releases")
print(f"Message ID: {result.message_id}")
parse_async()
async def parse_async(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> ParseResult
Asynchronously parses DDEX XML content.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
options: Optional[ParseOptions]
- Parsing configuration options
Returns: ParseResult
- Parsed DDEX message structure
Example:
import asyncio
from ddex_parser import DDEXParser
async def parse_multiple_files(file_paths):
parser = DDEXParser()
tasks = []
for file_path in file_paths:
with open(file_path, 'r') as f:
xml_content = f.read()
tasks.append(parser.parse_async(xml_content))
results = await asyncio.gather(*tasks)
return results
# Usage
results = asyncio.run(parse_multiple_files(['file1.xml', 'file2.xml']))
stream()
def stream(self, xml: Union[str, bytes], options: Optional[ParseOptions] = None) -> Iterator[Dict[str, Any]]
Creates an iterator for streaming through releases in large DDEX files.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
options: Optional[ParseOptions]
- Parsing configuration options
Returns: Iterator[Dict[str, Any]]
- Iterator yielding release dictionaries
Example:
parser = DDEXParser()
with open('large_catalog.xml', 'r') as f:
xml_content = f.read()
for release in parser.stream(xml_content):
print(f"Processing: {release['title']}")
print(f"Artist: {release['artist']}")
print(f"Release ID: {release['release_id']}")
to_dataframe()
def to_dataframe(self, xml: Union[str, bytes], schema: str = 'flat') -> 'pd.DataFrame'
Converts DDEX XML directly to a pandas DataFrame for analysis.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
schema: str
- Output schema: 'flat' (default) or 'graph'
Returns: pd.DataFrame
- Structured DataFrame with DDEX data
Raises: ImportError
- If pandas is not installed
Example:
import pandas as pd
from ddex_parser import DDEXParser
parser = DDEXParser()
# Load XML file
with open('catalog.xml', 'r') as f:
xml_content = f.read()
# Convert to DataFrame
df = parser.to_dataframe(xml_content, schema='flat')
# Analyze the data
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
# Basic statistics
print(f"Total releases: {df['release_id'].nunique()}")
print(f"Total tracks: {df['sound_recording_id'].nunique()}")
print(f"Unique artists: {df['display_artist'].nunique()}")
# Top genres
genre_counts = df['genre'].value_counts()
print(f"Top genres: {genre_counts.head()}")
detect_version()
def detect_version(self, xml: Union[str, bytes]) -> str
Detects the DDEX version from XML content.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
Returns: str
- Detected version (e.g., "4.3", "4.2", "3.8.2", "Unknown")
Example:
parser = DDEXParser()
with open('unknown_version.xml', 'r') as f:
xml_content = f.read()
version = parser.detect_version(xml_content)
print(f"Detected DDEX version: {version}")
if version == "Unknown":
print("Warning: Could not detect DDEX version")
sanity_check()
def sanity_check(self, xml: Union[str, bytes]) -> Dict[str, Any]
Performs a quick validation check on DDEX XML without full parsing.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
Returns: Dict[str, Any]
- Validation results with keys:
is_valid: bool
- Whether the XML passes basic validation
version: str
- Detected DDEX version
errors: List[str]
- List of validation errors
warnings: List[str]
- List of validation warnings
Example:
parser = DDEXParser()
with open('suspicious_file.xml', 'r') as f:
xml_content = f.read()
check = parser.sanity_check(xml_content)
if check['is_valid']:
print(f"✓ Valid DDEX {check['version']} file")
else:
print("✗ Invalid DDEX file")
for error in check['errors']:
print(f" Error: {error}")
if check['warnings']:
print("Warnings:")
for warning in check['warnings']:
print(f" Warning: {warning}")
ParseOptions
Configuration options for parsing behavior.
class ParseOptions:
def __init__(
self,
include_raw_extensions: bool = False,
include_comments: bool = False,
validate_references: bool = True,
streaming: bool = False,
timeout: float = 30.0,
) -> None: ...
def to_dict(self) -> Dict[str, Any]: ...
Constructor Parameters
include_raw_extensions: bool (default: False)
- Include raw XML for extension elements to preserve round-trip fidelity
include_comments: bool (default: False)
- Include XML comments in the parsed output
validate_references: bool (default: True)
- Validate that all resource references are resolvable
streaming: bool (default: False)
- Enable streaming mode for large files
timeout: float (default: 30.0)
- Parsing timeout in seconds
Example:
from ddex_parser import ParseOptions
# Default options
options = ParseOptions()
# Custom options for round-trip processing
roundtrip_options = ParseOptions(
include_raw_extensions=True,
include_comments=True,
validate_references=True
)
# Options for large file processing
streaming_options = ParseOptions(
streaming=True,
validate_references=False,
timeout=120.0
)
to_dict()
def to_dict(self) -> Dict[str, Any]
Converts options to dictionary format for internal use.
Returns: Dict[str, Any]
- Dictionary representation of options
ParseResult
Result object containing parsed DDEX message data.
class ParseResult:
def __init__(self, data: Dict[str, Any]) -> None: ...
message_id: str
version: str
release_count: int
releases: List[Dict[str, Any]]
Properties
message_id: str
- Unique message identifier from DDEX XML
version: str
- Detected DDEX version
release_count: int
- Number of releases in the message
releases: List[Dict[str, Any]]
- List of release data dictionaries
Example:
result = parser.parse(xml_content)
print(f"Message ID: {result.message_id}")
print(f"DDEX Version: {result.version}")
print(f"Release Count: {result.release_count}")
# Access individual releases
for release in result.releases:
print(f"Release: {release.get('title', 'Unknown')}")
print(f"Artist: {release.get('artist', 'Unknown')}")
Convenience Functions
parse()
def parse(xml: Union[str, bytes], **kwargs) -> ParseResult
Convenience function for quick parsing without creating a parser instance.
Parameters:
xml: Union[str, bytes]
- DDEX XML content
**kwargs
- Keyword arguments passed to ParseOptions
Returns: ParseResult
- Parsed DDEX message structure
Example:
from ddex_parser import parse
# Quick parse with default options
result = parse(xml_content)
# Quick parse with custom options
result = parse(
xml_content,
include_raw_extensions=True,
validate_references=False
)
DataFrame Integration
Schema Types
When using to_dataframe(), you can specify different output schemas:
Flat Schema (Default)
The flat schema provides a denormalized, analysis-friendly structure:
df = parser.to_dataframe(xml_content, schema='flat')
Common Columns:
message_id
- Message identifier
release_id
- Release identifier
sound_recording_id
- Track identifier
isrc
- International Standard Recording Code
title
- Track or release title
display_artist
- Primary artist name
label_name
- Record label
release_date
- Release date (ISO format)
genre
- Musical genre
territory
- Geographic territory
deal_type
- Commercial deal type
distribution_channel
- Distribution method
Graph Schema
The graph schema preserves the hierarchical DDEX structure:
df = parser.to_dataframe(xml_content, schema='graph')
This schema maintains the original XML hierarchy and is useful for round-trip processing.
DataFrame Examples
import pandas as pd
from ddex_parser import DDEXParser
parser = DDEXParser()
# Load and convert to DataFrame
with open('catalog.xml', 'r') as f:
xml_content = f.read()
df = parser.to_dataframe(xml_content)
# Basic analysis
print(f"Total releases: {df['release_id'].nunique()}")
print(f"Total tracks: {df['sound_recording_id'].nunique()}")
# Genre analysis
genre_dist = df['genre'].value_counts()
print("Top genres:")
print(genre_dist.head())
# Artist analysis
artist_track_counts = df.groupby('display_artist')['sound_recording_id'].nunique()
top_artists = artist_track_counts.sort_values(ascending=False).head(10)
print("Most prolific artists:")
print(top_artists)
# Territory analysis
territory_releases = df.groupby('territory')['release_id'].nunique()
print("Releases by territory:")
print(territory_releases)
# Date analysis
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
yearly_releases = df.groupby('release_year')['release_id'].nunique()
print("Releases by year:")
print(yearly_releases)
Error Handling
The Python API raises standard Python exceptions for various error conditions:
from ddex_parser import DDEXParser
parser = DDEXParser()
try:
result = parser.parse(xml_content)
except ValueError as e:
if "Invalid XML" in str(e):
print(f"XML parsing failed: {e}")
elif "Unsupported version" in str(e):
print(f"Unsupported DDEX version: {e}")
else:
print(f"Validation error: {e}")
except TimeoutError as e:
print(f"Parsing timeout: {e}")
except MemoryError as e:
print(f"Out of memory: {e}")
except ImportError as e:
print(f"Missing dependency: {e}")
except Exception as e:
print(f"Unexpected error: {e}")
Common Exception Types
ValueError: Invalid XML, schema violations, or unsupported DDEX versions
TimeoutError: Parsing exceeded specified timeout
MemoryError: File too large for available memory
ImportError: Missing optional dependencies (e.g., pandas for to_dataframe())
FileNotFoundError: File path issues when reading from disk
Performance Optimization
Memory Management
# Use streaming for large files
parser = DDEXParser()
options = ParseOptions(streaming=True, timeout=300.0)
# Process without reference validation for speed
fast_options = ParseOptions(
validate_references=False,
include_raw_extensions=False
)
Async Processing
import asyncio
from ddex_parser import DDEXParser, ParseResult
async def process_catalog_files(file_paths):
parser = DDEXParser()
semaphore = asyncio.Semaphore(5) # Limit concurrent operations
async def process_file(file_path):
async with semaphore:
with open(file_path, 'r') as f:
xml_content = f.read()
return await parser.parse_async(xml_content)
tasks = [process_file(path) for path in file_paths]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle results and exceptions
successful = [r for r in results if isinstance(r, ParseResult)]
errors = [r for r in results if isinstance(r, Exception)]
return successful, errors
Batch DataFrame Processing
import pandas as pd
from pathlib import Path
from ddex_parser import DDEXParser
def process_catalog_to_dataframe(directory_path):
parser = DDEXParser()
dataframes = []
xml_files = Path(directory_path).glob("*.xml")
for xml_file in xml_files:
try:
with open(xml_file, 'r') as f:
xml_content = f.read()
df = parser.to_dataframe(xml_content)
df['source_file'] = xml_file.name
dataframes.append(df)
except Exception as e:
print(f"Failed to process {xml_file}: {e}")
if dataframes:
combined_df = pd.concat(dataframes, ignore_index=True)
return combined_df
else:
return pd.DataFrame()
# Usage
catalog_df = process_catalog_to_dataframe('./ddex_files/')
print(f"Combined catalog: {len(catalog_df)} records from {catalog_df['source_file'].nunique()} files")
CLI Integration
The Python package includes a command-line interface:
# Parse a single file
python -m ddex_parser parse input.xml
# Convert to JSON
python -m ddex_parser parse input.xml --output output.json
# Convert to CSV (requires pandas)
python -m ddex_parser to-csv input.xml output.csv
# Validate DDEX file
python -m ddex_parser validate input.xml
# Detect version
python -m ddex_parser version input.xml
Access the CLI programmatically:
from ddex_parser.cli import main
import sys
# Simulate command line arguments
sys.argv = ['ddex_parser', 'parse', 'input.xml', '--output', 'output.json']
main()