# Source code for datastudio.datasets.formatters.jsonl_formatter

"""JSONL (JSON Lines) data format handler."""

import json
import logging
import os
import tempfile

from .base import BaseFormat

logger = logging.getLogger(__name__)


class JsonlFormat(BaseFormat):
    """JSONL (JSON Lines) data format handler.

    Each line contains one JSON object. Invalid lines are skipped with
    a warning.
    """

    @classmethod
    def extensions(cls) -> list:
        """Return supported file extensions.

        Returns:
            list: ['.jsonl']
        """
        return [".jsonl"]

    def load(self, file_path: str) -> list:
        """Load data from JSONL file.

        Args:
            file_path: Path to JSONL file.

        Returns:
            list: List of data dictionaries.

        Note:
            Invalid JSON lines are skipped with a warning message.
        """
        data = []
        # errors="surrogatepass" mirrors save(): with ensure_ascii=False,
        # save() may emit lone surrogates, and strict UTF-8 decoding would
        # make such files unreadable by this very class. Valid UTF-8
        # decodes identically under this handler.
        with open(file_path, "r", encoding="utf-8", errors="surrogatepass") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue  # skip blank lines silently
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.warning(
                        "Skipping invalid JSON at line %d in %s: %s",
                        line_num,
                        file_path,
                        e,
                    )
        return data

    def save(self, data: list, file_path: str, ensure_ascii: bool = False) -> None:
        """Save data to JSONL file.

        The write is atomic: data is written to a temporary file in the
        destination directory, fsync'd, then renamed over the target, so
        a crash or full disk never leaves a truncated destination file.

        Args:
            data: List of data dictionaries.
            file_path: Output file path.
            ensure_ascii: Whether to escape non-ASCII characters
                (default: False).
        """
        dir_path = os.path.dirname(file_path) or "."
        os.makedirs(dir_path, exist_ok=True)
        # Write to temp file first, then atomic rename to avoid data loss
        # on disk full.
        fd, temp_path = tempfile.mkstemp(dir=dir_path, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8", errors="surrogatepass") as f:
                for item in data:
                    f.write(json.dumps(item, ensure_ascii=ensure_ascii) + "\n")
                f.flush()
                os.fsync(f.fileno())  # ensure bytes hit disk before rename
            # mkstemp creates the temp file with mode 0600; when replacing
            # an existing file, preserve its permissions instead of
            # silently tightening them.
            try:
                os.chmod(temp_path, os.stat(file_path).st_mode & 0o7777)
            except OSError:
                pass  # destination doesn't exist yet; keep mkstemp default
            os.replace(temp_path, file_path)
        except BaseException:
            # Clean up temp file on any failure (including
            # KeyboardInterrupt); the original file is preserved.
            try:
                os.unlink(temp_path)
            except OSError:
                pass
            raise