Data Processing Issues
Troubleshooting data processing, file uploads, and text processing issues
This guide helps you resolve issues related to data processing, file uploads, text extraction, and data transformation in BroxiAI workflows.
File Upload Issues
Cannot Upload Files
Problem: Files failing to upload or being rejected
Symptoms:
Upload progress bar stuck
"File upload failed" errors
Files not appearing in file manager
Timeout during upload
Solutions:
Check File Size Limits
{ "file_limits": { "max_size": "100MB", "max_files": 50, "total_storage": "5GB" } }
Individual file limit: 100MB
Total workspace storage: Varies by plan
Concurrent uploads: Maximum 5 files
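If uploads repeatedly time out, it can help to pre-check a batch against these limits and cap concurrency on the client before sending anything. The sketch below is a minimal Python example; the limit values mirror the configuration above, and upload_to_api is a hypothetical upload helper you would supply.
import os
from concurrent.futures import ThreadPoolExecutor

MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB per file
MAX_FILES = 50                     # files per batch
MAX_CONCURRENT = 5                 # concurrent uploads

def precheck_batch(file_paths):
    """Validate a batch of local files against the documented limits."""
    if len(file_paths) > MAX_FILES:
        raise ValueError(f"Too many files: {len(file_paths)} (limit is {MAX_FILES})")
    for path in file_paths:
        if os.path.getsize(path) > MAX_FILE_SIZE:
            raise ValueError(f"{path} exceeds the 100MB per-file limit")

def upload_batch(file_paths, upload_to_api):
    """Upload files while keeping at most 5 uploads in flight."""
    precheck_batch(file_paths)
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT) as pool:
        return list(pool.map(upload_to_api, file_paths))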
Verify File Format Support
{ "supported_formats": { "documents": [".pdf", ".docx", ".doc", ".txt", ".rtf"], "spreadsheets": [".xlsx", ".xls", ".csv"], "data": [".json", ".xml", ".yaml"], "images": [".jpg", ".jpeg", ".png", ".gif"], "audio": [".mp3", ".wav", ".m4a"], "video": [".mp4", ".avi", ".mov"] } }
File Upload Configuration
// Example upload with validation
function uploadFile(file) {
  // Validate file size
  if (file.size > 100 * 1024 * 1024) {
    throw new Error('File too large. Maximum size is 100MB.');
  }

  // Validate file type
  const allowedTypes = ['.pdf', '.docx', '.txt', '.csv'];
  const fileExtension = file.name.toLowerCase().substr(file.name.lastIndexOf('.'));
  if (!allowedTypes.includes(fileExtension)) {
    throw new Error(`Unsupported file type: ${fileExtension}`);
  }

  // Upload file
  return uploadToAPI(file);
}
File Processing Failures
Problem: Uploaded files not being processed correctly
Solutions:
Check File Integrity
# Verify file is not corrupted
file suspicious_file.pdf

# Check PDF structure
pdfinfo suspicious_file.pdf

# Verify file encoding
file -i text_file.txt
Character Encoding Issues
import chardet

def detect_and_convert_encoding(file_path):
    # Detect encoding
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']

    # Convert to UTF-8 if needed
    if encoding.lower() != 'utf-8':
        with open(file_path, 'r', encoding=encoding) as f:
            content = f.read()
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    return encoding
File Structure Validation
import pandas as pd

def validate_csv_file(file_path):
    try:
        # Test CSV parsing
        df = pd.read_csv(file_path, nrows=5)
        return {
            'valid': True,
            'columns': list(df.columns),
            'rows_sample': len(df),
            'encoding': 'utf-8'
        }
    except Exception as e:
        return {
            'valid': False,
            'error': str(e)
        }
Text Processing Issues
Text Extraction Failures
Problem: Cannot extract text from documents
Solutions:
PDF Text Extraction
import PyPDF2
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    text = ""
    try:
        # Method 1: PyPDF2
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text += page.extract_text()
    except Exception as e:
        print(f"PyPDF2 failed: {e}")
        try:
            # Method 2: PyMuPDF (better for complex PDFs)
            doc = fitz.open(file_path)
            for page in doc:
                text += page.get_text()
            doc.close()
        except Exception as e:
            print(f"PyMuPDF failed: {e}")
            return None

    return text.strip()
Document Processing Configuration
{ "text_extraction": { "pdf_method": "pymupdf", "ocr_enabled": true, "language": "eng", "preserve_formatting": false, "extract_images": false } }
Handle Special Characters
import unicodedata

def clean_extracted_text(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKD', text)

    # Remove control characters (keep newlines and tabs)
    text = ''.join(char for char in text
                   if char in '\n\t' or not unicodedata.category(char).startswith('C'))

    # Fix common extraction issues
    text = text.replace('\u00a0', ' ')   # Non-breaking space
    text = text.replace('\u2019', "'")   # Right single quotation mark
    text = text.replace('\u201c', '"')   # Left double quotation mark
    text = text.replace('\u201d', '"')   # Right double quotation mark

    return text
Text Length and Processing Limits
Problem: Text too long for processing components
Solutions:
Text Chunking Strategy
def chunk_text(text, max_chunk_size=4000, overlap=200):
    """Split text into overlapping chunks"""
    chunks = []
    start = 0

    while start < len(text):
        end = start + max_chunk_size

        # Try to break at a sentence boundary
        if end < len(text):
            # Look for a sentence ending in the last 200 characters of the chunk
            for i in range(end, start + max_chunk_size - 200, -1):
                if text[i] in '.!?':
                    end = i + 1
                    break

        chunks.append(text[start:end])

        # Stop once the end of the text has been reached
        if end >= len(text):
            break

        # Move start back by the overlap
        start = end - overlap

    return chunks
Token Counting
import tiktoken

def count_tokens(text, model="gpt-4"):
    """Count tokens for a specific model"""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def split_by_tokens(text, max_tokens=3000, model="gpt-4"):
    """Split text by token count"""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        chunk_text = encoding.decode(chunk_tokens)
        chunks.append(chunk_text)

    return chunks
Language and Encoding Issues
Problem: Issues with non-English text or special characters
Solutions:
Multi-language Support
import langdetect

def detect_and_process_language(text):
    try:
        language = langdetect.detect(text)

        # Language-specific processing (langdetect reports Chinese as 'zh-cn'/'zh-tw')
        if language.startswith('zh'):  # Chinese
            # Use Chinese-specific text processing
            return process_chinese_text(text)
        elif language == 'ar':  # Arabic
            # Use Arabic-specific text processing
            return process_arabic_text(text)
        else:
            # Default processing
            return process_default_text(text)
    except langdetect.LangDetectException:
        # Fall back to default processing
        return process_default_text(text)
Encoding Detection and Conversion
import chardet
import codecs

def read_file_with_encoding_detection(file_path):
    # Detect encoding
    with open(file_path, 'rb') as f:
        raw_data = f.read()
        encoding_info = chardet.detect(raw_data)
        encoding = encoding_info['encoding']
        confidence = encoding_info['confidence']

    # Read with detected encoding
    if confidence > 0.8:
        try:
            with codecs.open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            pass

    # Fallback encodings to try
    fallback_encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    for enc in fallback_encodings:
        try:
            with codecs.open(file_path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            continue

    raise ValueError("Could not decode file with any encoding")
Data Transformation Issues
Data Format Conversion
Problem: Cannot convert between data formats
Solutions:
CSV to JSON Conversion
import pandas as pd
import json

def csv_to_json(csv_file_path, json_file_path):
    try:
        # Read CSV
        df = pd.read_csv(csv_file_path)

        # Handle missing values
        df = df.fillna('')

        # Convert to JSON
        json_data = df.to_dict('records')

        # Write JSON
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)

        return json_data
    except Exception as e:
        print(f"Conversion failed: {e}")
        return None
XML Processing
import xml.etree.ElementTree as ET

def xml_to_dict(xml_string):
    def elem_to_dict(elem):
        result = {}

        # Add attributes
        if elem.attrib:
            result.update(elem.attrib)

        # Add text content
        if elem.text and elem.text.strip():
            if result:
                result['_text'] = elem.text.strip()
            else:
                return elem.text.strip()

        # Add children
        for child in elem:
            child_data = elem_to_dict(child)
            if child.tag in result:
                if not isinstance(result[child.tag], list):
                    result[child.tag] = [result[child.tag]]
                result[child.tag].append(child_data)
            else:
                result[child.tag] = child_data

        return result

    root = ET.fromstring(xml_string)
    return {root.tag: elem_to_dict(root)}
Data Validation Issues
Problem: Data not passing validation checks
Solutions:
Schema Validation
import jsonschema

def validate_data_schema(data, schema):
    try:
        jsonschema.validate(instance=data, schema=schema)
        return {'valid': True}
    except jsonschema.ValidationError as e:
        return {
            'valid': False,
            'error': e.message,
            'path': list(e.path),
            'failed_value': e.instance
        }

# Example schema
# Note: "format" keywords such as "email" are only enforced when a FormatChecker
# is passed, e.g. jsonschema.validate(..., format_checker=jsonschema.FormatChecker())
user_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string", "minLength": 1},
        "email": {"type": "string", "format": "email"},
        "age": {"type": "integer", "minimum": 0}
    },
    "required": ["name", "email"]
}
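A quick usage check against the example schema above; the record here is made up for illustration.
result = validate_data_schema({"name": "Ada", "age": 36}, user_schema)
print(result['valid'])  # False: the required 'email' field is missing
print(result['error'])  # e.g. "'email' is a required property"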
Data Cleaning
import pandas as pd
import numpy as np

def clean_dataframe(df):
    # Remove completely empty rows
    df = df.dropna(how='all')

    # Remove completely empty columns
    df = df.dropna(axis=1, how='all')

    # Clean string columns
    string_columns = df.select_dtypes(include=['object']).columns
    for col in string_columns:
        # Strip whitespace
        df[col] = df[col].astype(str).str.strip()
        # Replace empty strings with NaN
        df[col] = df[col].replace('', np.nan)

    # Handle numeric columns
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_columns:
        # Detect outliers with the IQR rule
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Flag outliers (don't remove automatically)
        df[f'{col}_outlier'] = (df[col] < lower_bound) | (df[col] > upper_bound)

    return df
Memory and Performance Issues
Large File Processing
Problem: Running out of memory when processing large files
Solutions:
Streaming Processing
import pandas as pd

def process_large_csv_streaming(file_path, chunk_size=10000):
    results = []

    # Process in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Process each chunk
        processed_chunk = process_data_chunk(chunk)
        results.append(processed_chunk)

        # Clear memory
        del chunk

    # Combine results
    return pd.concat(results, ignore_index=True)

def process_data_chunk(chunk):
    # Your processing logic here
    return chunk.apply(some_transformation)
Memory-Efficient Text Processing
def process_large_text_file(file_path, process_function):
    with open(file_path, 'r', encoding='utf-8') as file:
        buffer = ""
        results = []

        for line in file:
            buffer += line

            # Process when buffer reaches a certain size
            if len(buffer) > 10000:  # 10KB chunks
                result = process_function(buffer)
                results.append(result)
                buffer = ""

        # Process remaining buffer
        if buffer:
            result = process_function(buffer)
            results.append(result)

    return results
Processing Performance Optimization
Problem: Data processing taking too long
Solutions:
Parallel Processing
import multiprocessing as mp
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

def parallel_process_data(data_chunks, process_function, max_workers=None):
    if max_workers is None:
        # Leave one core free, but never drop below one worker
        max_workers = max(1, mp.cpu_count() - 1)

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_function, data_chunks))

    return results

# Usage
chunks = np.array_split(large_dataframe, 4)  # Split into 4 chunks
results = parallel_process_data(chunks, your_process_function)
final_result = pd.concat(results)
Caching Expensive Operations
import hashlib

# Cache keyed by a hash of the input so keys stay small even for long texts
_processing_cache = {}

def process_with_cache(text, max_entries=1000):
    # Create a hash of the input
    text_hash = hashlib.md5(text.encode()).hexdigest()

    # Check if we've processed this exact text before
    if text_hash not in _processing_cache:
        if len(_processing_cache) >= max_entries:
            # Evict the oldest entry (dicts preserve insertion order)
            _processing_cache.pop(next(iter(_processing_cache)))
        _processing_cache[text_hash] = complex_nlp_processing(text)

    return _processing_cache[text_hash]
Error Handling and Recovery
Graceful Error Handling
import logging
from typing import Optional, Dict, Any

def robust_data_processor(data: Any) -> Optional[Dict[str, Any]]:
    """Process data with comprehensive error handling"""
    try:
        # Validate input
        if not data:
            raise ValueError("Empty data provided")

        # Process data
        result = process_data(data)

        # Validate output
        if not validate_output(result):
            raise ValueError("Output validation failed")

        return {
            'success': True,
            'data': result,
            'errors': []
        }
    except ValueError as e:
        logging.error(f"Validation error: {e}")
        return {
            'success': False,
            'data': None,
            'errors': [{'type': 'validation', 'message': str(e)}]
        }
    except Exception as e:
        logging.error(f"Processing error: {e}")
        return {
            'success': False,
            'data': None,
            'errors': [{'type': 'processing', 'message': str(e)}]
        }
Recovery Strategies
import logging

class ProcessingError(Exception):
    """Raised when every available processor fails."""

def process_with_fallback(data, primary_processor, fallback_processor):
    """Try primary processor, fall back to secondary if it fails"""
    try:
        return primary_processor(data)
    except Exception as e:
        logging.warning(f"Primary processor failed: {e}. Trying fallback.")
        try:
            return fallback_processor(data)
        except Exception as e2:
            logging.error(f"Fallback processor also failed: {e2}")
            raise ProcessingError("All processors failed") from e2
Monitoring and Debugging
Data Processing Metrics
import time
import logging
from contextlib import contextmanager

@contextmanager
def processing_timer(operation_name):
    start_time = time.time()
    try:
        yield
    finally:
        duration = time.time() - start_time
        logging.info(f"{operation_name} took {duration:.2f} seconds")

# Usage
with processing_timer("CSV Processing"):
    result = process_csv_file(file_path)
Data Quality Checks
def data_quality_report(df):
    """Generate data quality report"""
    report = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_data': {},
        'data_types': {},
        'duplicates': df.duplicated().sum()
    }

    for column in df.columns:
        missing_count = df[column].isnull().sum()
        missing_percentage = (missing_count / len(df)) * 100
        report['missing_data'][column] = {
            'count': missing_count,
            'percentage': missing_percentage
        }
        report['data_types'][column] = str(df[column].dtype)

    return report
Getting Help
For data processing issues:
Check File Format: Ensure files are in supported formats
Validate Data: Use schema validation tools
Monitor Resources: Check memory and CPU usage
Test with Samples: Process small samples first
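For the "Test with Samples" step, one cheap approach is to run your pipeline on the first few rows before committing to the full file. A minimal sketch, where process_dataframe stands in for your own processing step:
import pandas as pd

def dry_run_then_process(file_path, process_dataframe, sample_rows=100):
    """Fail fast on a small sample before processing the whole file."""
    sample = pd.read_csv(file_path, nrows=sample_rows)
    process_dataframe(sample)                          # surface errors cheaply
    return process_dataframe(pd.read_csv(file_path))   # then process the full file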
Support Information to Include:
File formats and sizes
Processing error messages
Data samples (anonymized)
System resource usage
Processing configuration