Python Content & Segments Coding - Best Practices

Overview

This guide covers advanced patterns and best practices for working with content and segments in HyperFlow Python transforms. It includes techniques for efficient processing, metadata preservation, and integration with HyperFlow's content management system.

Understanding Content vs Segments

Content

What: Any mimetyped content data (uploaded files, LLM-generated media, scraped data, transformed content)
Access: MIMETYPE + URL pairs providing HTTP access to content data
Use cases: Document processing, media handling, rich content generation, multi-modal workflows

Segments

What: Text chunks with metadata (sentences, paragraphs, semantic units)
Access: Text + metadata structures with optional positioning information
Use cases: Text analysis, RAG systems, semantic search, content segmentation

Content Processing Patterns

Extracting and Transforming Content

# Summarize every incoming content item and emit one plain-text content
# item per input. `input1` is the transform's Content input: an iterable of
# dicts carrying "mimetype", "url" and, optionally, extracted "text".
contents = input1  # Content input type

processed_contents = []
for content in contents:
    # Access content data
    mimetype = content["mimetype"]
    url = content["url"]
    text = content.get("text", "")  # Extracted text if available

    # Handle different content types
    if mimetype.startswith("image/"):
        # Download the image. The timeout keeps a stalled server from hanging
        # the transform, and raise_for_status() prevents an HTTP error page
        # from being summarized as if it were image bytes.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image_data = response.content
        # Process image and create summary
        summary = f"Image analysis: {len(image_data)} bytes, type: {mimetype}"
    elif text:
        # Use extracted text for text-based content.
        # Split on a real newline ('\n'); a doubled backslash would match the
        # two literal characters "\" + "n" and never split the text.
        lines = text.split('\n')
        summary = '\n'.join(lines[:5])  # First 5 lines
    else:
        summary = f"Content type: {mimetype}, size: available via URL"

    # Create new content with summary
    processed_contents.append({
        "dataType": "text",
        "mimetype": "text/plain",
        "data": f"SUMMARY:\n{summary}"
    })

# Output as new content
output = processed_contents

Content Type Detection and Routing

# Route content based on mimetype and process via HTTP access.
# `input1` is the transform's Content input: an iterable of dicts carrying
# "mimetype" and, optionally, "text" and "url".
contents = input1

text_contents = []
json_contents = []
other_contents = []  # Collected for completeness; not processed below

for content in contents:
    mimetype = content["mimetype"]

    if mimetype.startswith("text/"):
        text_contents.append(content)
    elif mimetype == "application/json":
        json_contents.append(content)
    else:
        other_contents.append(content)

# Process each type differently
processed = []

# Process text files
for tc in text_contents:
    # Prefer already-extracted text; fall back to downloading from the URL
    text = tc.get("text", "")
    if not text and tc.get("url"):
        # Bounded wait, and fail loudly on HTTP errors instead of treating an
        # error page as the document body.
        response = requests.get(tc["url"], timeout=30)
        response.raise_for_status()
        text = response.text

    processed.append({
        "dataType": "text",
        "mimetype": "text/plain",
        "data": text.upper()  # Example transformation
    })

# Process JSON files
for jc in json_contents:
    # Prefer already-extracted text; fall back to downloading from the URL
    text = jc.get("text", "")
    if not text and jc.get("url"):
        response = requests.get(jc["url"], timeout=30)
        response.raise_for_status()
        text = response.text

    if not text:
        # Neither extracted text nor a URL: nothing to parse, and
        # json.loads("") would raise.
        continue

    data = json.loads(text)
    # Transform JSON data. Only JSON objects can take an extra key; arrays
    # and scalars are passed through re-serialized but otherwise unchanged.
    if isinstance(data, dict):
        data["processed"] = True
    processed.append({
        "dataType": "text",
        "mimetype": "application/json",
        "data": json.dumps(data, indent=2)
    })

output = processed

Merging Multiple Contents

# Merge multiple documents into one per text mimetype, with HTTP access.
# `input1` is the transform's Content input: an iterable of dicts carrying
# "mimetype" and, optionally, "text" and "url".
contents = input1

# Group by mimetype and access content data
grouped = {}
for content in contents:
    mimetype = content["mimetype"]

    # Only text/* groups are merged below, so skip everything else up front
    # rather than downloading data that would be discarded.
    if not mimetype.startswith("text/"):
        continue

    # Access content data via HTTP if needed
    text = content.get("text", "")
    if not text and content.get("url"):
        # Bounded wait, and fail loudly on HTTP errors instead of merging an
        # error page into the output document.
        response = requests.get(content["url"], timeout=30)
        response.raise_for_status()
        text = response.text

    grouped.setdefault(mimetype, []).append(text)

# Create merged documents
merged_contents = []
# Real newlines ("\n") around the rule; doubled backslashes would emit the
# literal characters "\" + "n" into the merged document.
separator = "\n\n" + "=" * 50 + "\n\n"
for mimetype, texts in grouped.items():
    merged_contents.append({
        "dataType": "text",
        "mimetype": mimetype,
        "data": separator.join(texts)
    })

output = merged_contents

Content with Rich Metadata

# Generate content with detailed metadata
import datetime

# Process input and generate analysis.
# NOTE(review): analyze_text is not defined in this snippet; it is used here
# as if it returns a JSON-serializable dict (it is passed to json.dumps and
# queried with .get) - confirm against its actual definition.
analysis_result = analyze_text(input1)

# Create content with metadata. Metadata is a list of {"key", "value"} pairs
# with string values.
output = {
    "dataType": "text",
    "mimetype": "application/json",
    "data": json.dumps(analysis_result, indent=2),
    "metadata": [
        {"key": "analysis_type", "value": "sentiment"},
        # `import datetime` binds the module, so the class must be reached as
        # datetime.datetime; an explicit UTC timezone keeps the timestamp
        # unambiguous across hosts.
        {
            "key": "processed_date",
            "value": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        },
        {"key": "confidence", "value": str(analysis_result.get("confidence", 0))},
        {"key": "version", "value": "1.0"}
    ]
}