Source code for qufe.texthandler

"""
Text processing utilities for string manipulation, formatting, and analysis.

This module provides functions for:
- Converting lists to DokuWiki table format
- Finding string occurrences with context
- Pretty-printing nested dictionaries
- Extracting substrings between delimiters
- Displaying items in column format
- Extracting a number (such as a price) from a string as a float
"""

from itertools import zip_longest
from collections.abc import Iterable
from typing import List, Dict, Any, Union
import re


def list_to_doku_wiki_table(data: List[List[Any]]) -> None:
    """
    Convert a 2D list to DokuWiki table format and print it.

    The first row is treated as headers (cells delimited by ``^``); every
    following row is treated as data (cells delimited by ``|``). Each cell is
    converted with ``str()`` before joining, so numbers and other non-string
    values are accepted alongside strings.

    Args:
        data: 2D list whose first row contains the headers. Nothing is
            printed when ``data`` itself or its first row is empty.

    Example:
        >>> data = [['Name', 'Age'], ['Alice', 25], ['Bob', 30]]
        >>> list_to_doku_wiki_table(data)
        ^ Name ^ Age ^
        | Alice | 25 |
        | Bob | 30 |
    """
    if not data or not data[0]:
        return

    header, *rows = data
    # str() each cell: str.join raises TypeError on non-string items.
    print(f'^ {" ^ ".join(map(str, header))} ^')
    for row in rows:
        print(f'| {" | ".join(map(str, row))} |')
def find_all_occurrences(
        input_string: str,
        str_to_find: str,
        print_len: bool = True) -> List[int]:
    """
    Locate every starting index at which a substring occurs in a string.

    The scan resumes one character after each hit, so overlapping
    occurrences are all reported.

    Args:
        input_string: The string to search in
        str_to_find: The substring to look for
        print_len: When True, print how many occurrences were found

    Returns:
        Starting indexes of all occurrences, in ascending order

    Example:
        >>> find_all_occurrences("hello world hello", "hello")
        occurrences found: 2
        [0, 12]
    """
    hits: List[int] = []
    cursor = input_string.find(str_to_find)
    while cursor != -1:
        hits.append(cursor)
        # Step forward by a single character to keep overlapping matches.
        cursor = input_string.find(str_to_find, cursor + 1)

    if print_len:
        print(f'occurrences found: {len(hits)}')
    return hits
def substring_between(
        input_string: str,
        start_string: str,
        end: Union[str, int],
        start_offset: int = 0) -> List[str]:
    """
    Extract substrings that begin at a start marker and run to an end marker.

    Every occurrence of ``start_string`` (overlapping ones included) yields at
    most one result. A result begins ``start_offset`` characters before the
    marker (clamped to the beginning of ``input_string``) and always contains
    the start marker itself.

    Args:
        input_string: The string to extract from.
        start_string: The starting delimiter.
        end: Either the ending delimiter (``str``) or a length (``int``).
            With a delimiter, the result runs through the first occurrence of
            ``end`` found *after* the start marker and includes it; a start
            marker with no following ``end`` produces no result. With a
            length, the result is the first ``end`` characters counted from
            the offset-adjusted start.
        start_offset: How many characters before the start marker to include.
            The sign is ignored.

    Returns:
        List of extracted substrings, in order of appearance.

    Example:
        >>> substring_between("start{content}end start{more}end", "start{", "}")
        ['start{content}', 'start{more}']
        >>> substring_between('a href="x.html" b', 'href="', '"')
        ['href="x.html"']
        >>> substring_between("id=12345;", "id=", 5)
        ['id=12']
    """
    lead = abs(start_offset)
    marker_len = len(start_string)
    result: List[str] = []

    start_index = input_string.find(start_string)
    while start_index != -1:
        adjusted_start = max(start_index - lead, 0)
        if isinstance(end, str):
            # Look for the end delimiter only past the start marker; searching
            # from adjusted_start would match a delimiter that sits inside the
            # marker itself or inside the offset preamble.
            end_index = input_string.find(end, start_index + marker_len)
            if end_index != -1:
                result.append(input_string[adjusted_start:end_index + len(end)])
        elif isinstance(end, int):
            # Two-step slice keeps Python's usual meaning for a negative length.
            result.append(input_string[adjusted_start:][:end])
        # Advance one character so overlapping start markers are not skipped.
        start_index = input_string.find(start_string, start_index + 1)
    return result
# A period that is sentence/abbreviation punctuation rather than a decimal
# point: either it touches no digit on either side, or it directly follows a
# letter (as in "Rs.500" or "No. 5").
_STRAY_PERIOD_RE = re.compile(r'(?<!\d)\.(?!\d)|(?<=[^\W\d_])\.')
# A run of digits and decimal points.
_NUMBER_FRAGMENT_RE = re.compile(r'[\d.]+')


def extract_number(text: str, strict: bool = True) -> float:
    """
    Extract the first number found in text and return it as a float.

    Processing steps:

    1. Everything from the first ``(``, ``[`` or ``{`` onward is discarded.
    2. Periods that are punctuation rather than decimal points are dropped
       (a period touching no digit, or one directly following a letter), so
       ``"Rs. 500"`` yields 500.0 rather than 0.5.
    3. Commas, spaces and underscores are removed as thousands separators.
    4. The first remaining run of digits/decimal points is converted.

    A leading minus sign is not preserved: ``"-12.3"`` yields ``12.3``.

    Args:
        text: String containing numeric information.
        strict: If True (default), raise ValueError when the input is empty,
            not a string, or contains no number. If False, return 0.0 in
            those cases instead.

    Returns:
        The extracted number.

    Raises:
        ValueError: When ``strict`` is True and no number can be extracted;
            and, regardless of ``strict``, when the extracted fragment holds
            more than one decimal point (e.g. ``"1.234.56"``).

    Examples:
        >>> extract_number("1,234.56원")
        1234.56
        >>> extract_number("₩ 1_234_567")
        1234567.0
        >>> extract_number("2,500 items")
        2500.0
        >>> extract_number("Score: 98.5%")
        98.5
        >>> extract_number("3,456.78 (including tax)")
        3456.78
        >>> extract_number("Rs. 500")
        500.0
        >>> extract_number("Temperature: -12.3°C")
        12.3
        >>> extract_number("text only", strict=False)
        0.0
        >>> extract_number("free (0원)", strict=False)
        0.0
    """
    # Input validation - fail fast
    if not text or not isinstance(text, str):
        if strict:
            raise ValueError(f"Invalid input: {text}")
        return 0.0

    # Keep only the content before the first bracket of any kind.
    for bracket in ('(', '[', '{'):
        if bracket in text:
            text = text.split(bracket)[0]

    # Punctuation periods must go *before* spaces are removed; otherwise
    # "Rs. 500" collapses to "Rs.500" and the period is read as a decimal
    # point, producing 0.5.
    cleaned = _STRAY_PERIOD_RE.sub('', text)
    # Remove common separators (thousands separators, spaces).
    cleaned = cleaned.replace(',', '').replace(' ', '').replace('_', '')

    # First fragment that actually contains a digit (never a bare ".").
    combined = next(
        (frag for frag in _NUMBER_FRAGMENT_RE.findall(cleaned)
         if any(ch.isdigit() for ch in frag)),
        None,
    )
    if combined is None:
        if strict:
            raise ValueError(f"No number found: '{text}'")
        return 0.0

    # Ambiguous input such as "1.234.56" is always an error.
    if combined.count('.') > 1:
        raise ValueError(f"Multiple decimal points found: '{text}' -> '{combined}'")

    try:
        return float(combined)
    except ValueError as e:
        if strict:
            raise ValueError(
                f"Number conversion failed: '{text}' -> '{combined}', error: {e}"
            ) from e
        return 0.0