From f18ca4de37a2f7a423788a968efc7c510880abfe Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Fri, 16 May 2025 19:26:06 -0500 Subject: [PATCH 01/25] Initial rework of EPG processesing. --- apps/epg/tasks.py | 643 +++++++++++++++++++++++++--------------- core/utils.py | 35 +++ dispatcharr/settings.py | 28 ++ requirements.txt | 2 + 4 files changed, 464 insertions(+), 244 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index f102630f..395f49cb 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -5,9 +5,11 @@ import gzip import os import uuid import requests -import xml.etree.ElementTree as ET import time # Add import for tracking download progress from datetime import datetime, timedelta, timezone as dt_timezone +import gc # Add garbage collection module +import json +from lxml import etree # Using lxml exclusively from celery import shared_task from django.conf import settings @@ -40,13 +42,16 @@ def send_epg_update(source_id, action, progress, **kwargs): # Now, send the updated data dictionary channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - 'data': data - } - ) + try: + async_to_sync(channel_layer.group_send)( + 'updates', + { + 'type': 'update', + 'data': data + } + ) + except Exception as e: + logger.warning(f"Failed to send WebSocket update: {e}") def delete_epg_refresh_task_by_id(epg_id): @@ -206,8 +211,12 @@ def fetch_xmltv(source): send_epg_update(source.id, "downloading", 100, status="error", error="No URL provided and no valid local file exists") return False + # Clean up existing cache file if os.path.exists(source.get_cache_file()): - os.remove(source.get_cache_file()) + try: + os.remove(source.get_cache_file()) + except Exception as e: + logger.warning(f"Failed to remove existing cache file: {e}") logger.info(f"Fetching XMLTV data from source: {source.name}") try: @@ -235,7 +244,7 @@ def fetch_xmltv(source): send_epg_update(source.id, "downloading", 0) # Use streaming response to track download progress - with requests.get(source.url, headers=headers, stream=True, timeout=30) as response: + with requests.get(source.url, headers=headers, stream=True, timeout=60) as response: # Handle 404 specifically if response.status_code == 404: logger.error(f"EPG URL not found (404): {source.url}") @@ -304,9 +313,10 @@ def fetch_xmltv(source): downloaded = 0 start_time = time.time() last_update_time = start_time + update_interval = 0.5 # Only update every 0.5 seconds with open(cache_file, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): + for chunk in response.iter_content(chunk_size=16384): # Increased chunk size for better performance if chunk: f.write(chunk) @@ -326,17 +336,18 @@ def fetch_xmltv(source): # Time remaining (in seconds) time_remaining = (total_size - downloaded) / (speed * 1024) if speed > 0 and total_size > 0 else 0 - # Only send updates every 0.5 seconds to avoid flooding + # Only send updates at specified intervals to avoid flooding current_time = time.time() - if current_time - last_update_time >= 0.5 and progress > 0: + if current_time - last_update_time >= update_interval and progress > 0: last_update_time = current_time send_epg_update( source.id, "downloading", progress, - speed=speed, - elapsed_time=elapsed_time, - time_remaining=time_remaining + speed=round(speed, 2), + elapsed_time=round(elapsed_time, 1), + time_remaining=round(time_remaining, 1), + downloaded=f"{downloaded / (1024 * 1024):.2f} MB" ) # Send completion notification @@ -424,6 +435,20 @@ def 
fetch_xmltv(source): ) send_epg_update(source.id, "downloading", 100, status="error", error=user_message) return False + except requests.exceptions.Timeout as e: + # Handle timeout errors specifically + error_message = str(e) + user_message = f"Timeout error: EPG source '{source.name}' took too long to respond" + logger.error(f"Timeout error fetching XMLTV from {source.name}: {e}", exc_info=True) + + # Update source status + source.status = 'error' + source.last_message = user_message + source.save(update_fields=['status', 'last_message']) + + # Send notifications + send_epg_update(source.id, "downloading", 100, status="error", error=user_message) + return False except Exception as e: error_message = str(e) logger.error(f"Error fetching XMLTV from {source.name}: {e}", exc_info=True) @@ -496,65 +521,133 @@ def parse_channels_only(source): logger.info(f"Parsing channels from EPG file: {file_path}") existing_epgs = {e.tvg_id: e for e in EPGData.objects.filter(epg_source=source)} - # Read entire file (decompress if .gz) - if file_path.endswith('.gz'): - with open(file_path, 'rb') as gz_file: - decompressed = gzip.decompress(gz_file.read()) - xml_data = decompressed.decode('utf-8') - else: - with open(file_path, 'r', encoding='utf-8') as xml_file: - xml_data = xml_file.read() + # Update progress to show file read starting + send_epg_update(source.id, "parsing_channels", 10) - # Update progress to show file read completed - send_epg_update(source.id, "parsing_channels", 25) - - root = ET.fromstring(xml_data) - channels = root.findall('channel') + # Stream parsing instead of loading entire file at once + is_gzipped = file_path.endswith('.gz') epgs_to_create = [] epgs_to_update = [] + total_channels = 0 + processed_channels = 0 + batch_size = 500 # Process in batches to limit memory usage - logger.info(f"Found {len(channels)} entries in {file_path}") + try: + # Create a parser with the desired options + parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) - # Update progress to show parsing started - send_epg_update(source.id, "parsing_channels", 50) + # Count channels for progress reporting - use proper lxml approach + # Open the file first + source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - total_channels = len(channels) - for i, channel_elem in enumerate(channels): - tvg_id = channel_elem.get('id', '').strip() - if not tvg_id: - continue # skip blank/invalid IDs + # Create an iterparse context without parser parameter + channel_finder = etree.iterparse(source_file, events=('end',), tag='channel') - display_name = channel_elem.findtext('display-name', default=tvg_id).strip() + # Count channels + total_channels = sum(1 for _ in channel_finder) - if tvg_id in existing_epgs: - epg_obj = existing_epgs[tvg_id] - if epg_obj.name != display_name: - epg_obj.name = display_name - epgs_to_update.append(epg_obj) - else: - epgs_to_create.append(EPGData( - tvg_id=tvg_id, - name=display_name, - epg_source=source, - )) + # Close the file to reset position + source_file.close() - # Send occasional progress updates - if i % 100 == 0 or i == total_channels - 1: - progress = 50 + int((i / total_channels) * 40) # Scale to 50-90% range - send_epg_update(source.id, "parsing_channels", progress) + # Update progress after counting + send_epg_update(source.id, "parsing_channels", 25, total_channels=total_channels) - # Update progress before database operations - send_epg_update(source.id, "parsing_channels", 90) + # Reset file position for actual processing + source_file = 
gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') + channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + for _, elem in channel_parser: + tvg_id = elem.get('id', '').strip() + if tvg_id: + display_name = None + for child in elem: + if child.tag == 'display-name' and child.text: + display_name = child.text.strip() + break + + if not display_name: + display_name = tvg_id + + if tvg_id in existing_epgs: + epg_obj = existing_epgs[tvg_id] + if epg_obj.name != display_name: + epg_obj.name = display_name + epgs_to_update.append(epg_obj) + else: + epgs_to_create.append(EPGData( + tvg_id=tvg_id, + name=display_name, + epg_source=source, + )) + + processed_channels += 1 + + # Batch processing + if len(epgs_to_create) >= batch_size: + EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) + epgs_to_create = [] + # Force garbage collection + gc.collect() + + if len(epgs_to_update) >= batch_size: + EPGData.objects.bulk_update(epgs_to_update, ["name"]) + epgs_to_update = [] + # Force garbage collection + gc.collect() + + # Send progress updates + if processed_channels % 100 == 0 or processed_channels == total_channels: + progress = 25 + int((processed_channels / total_channels) * 65) if total_channels > 0 else 90 + send_epg_update( + source.id, + "parsing_channels", + progress, + processed=processed_channels, + total=total_channels + ) + + # Clear memory + elem.clear() + while elem.getprevious() is not None: + del elem.getparent()[0] + + # Make sure to close the file + source_file.close() + + except (etree.XMLSyntaxError, Exception) as xml_error: + # Instead of falling back, just handle the error + logger.error(f"XML parsing failed: {xml_error}") + # Update status to error + source.status = 'error' + source.last_message = f"Error parsing XML file: {str(xml_error)}" + source.save(update_fields=['status', 'last_message']) + send_epg_update(source.id, "parsing_channels", 100, status="error", error=str(xml_error)) + return False + + # Process any remaining items if epgs_to_create: EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) if epgs_to_update: EPGData.objects.bulk_update(epgs_to_update, ["name"]) + # Final garbage collection + gc.collect() + + # Update source status with channel count + source.status = 'success' + source.last_message = f"Successfully parsed {processed_channels} channels" + source.save(update_fields=['status', 'last_message']) + # Send completion notification - send_epg_update(source.id, "parsing_channels", 100, status="success") + send_epg_update( + source.id, + "parsing_channels", + 100, + status="success", + channels_count=processed_channels + ) channel_layer = get_channel_layer() async_to_sync(channel_layer.group_send)( @@ -565,7 +658,7 @@ def parse_channels_only(source): } ) - logger.info("Finished parsing channel info.") + logger.info(f"Finished parsing channel info. 
Found {processed_channels} channels.") return True except FileNotFoundError: @@ -592,199 +685,183 @@ def parse_programs_for_tvg_id(epg_id): logger.info(f"Program parse for {epg_id} already in progress, skipping duplicate task") return "Task already running" - epg = EPGData.objects.get(id=epg_id) - epg_source = epg.epg_source + try: + epg = EPGData.objects.get(id=epg_id) + epg_source = epg.epg_source - if not Channel.objects.filter(epg_data=epg).exists(): - logger.info(f"No channels matched to EPG {epg.tvg_id}") - release_task_lock('parse_epg_programs', epg_id) - return - - logger.info(f"Refreshing program data for tvg_id: {epg.tvg_id}") - - # First, remove all existing programs - ProgramData.objects.filter(epg=epg).delete() - - file_path = epg_source.file_path - if not file_path: - file_path = epg_source.get_cache_file() - - # Check if the file exists - if not os.path.exists(file_path): - logger.error(f"EPG file not found at: {file_path}") - - # Update the file path in the database - new_path = epg_source.get_cache_file() - logger.info(f"Updating file_path from '{file_path}' to '{new_path}'") - epg_source.file_path = new_path - epg_source.save(update_fields=['file_path']) - - # Fetch new data before continuing - if epg_source.url: - logger.info(f"Fetching new EPG data from URL: {epg_source.url}") - # Properly check the return value from fetch_xmltv - fetch_success = fetch_xmltv(epg_source) - - # If fetch was not successful or the file still doesn't exist, abort - if not fetch_success: - logger.error(f"Failed to fetch EPG data, cannot parse programs for tvg_id: {epg.tvg_id}") - # Update status to error if not already set - epg_source.status = 'error' - epg_source.last_message = f"Failed to download EPG data, cannot parse programs" - epg_source.save(update_fields=['status', 'last_message']) - send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="Failed to download EPG file") - release_task_lock('parse_epg_programs', epg_id) - return - - # Also check if the file exists after download - if not os.path.exists(new_path): - logger.error(f"Failed to fetch EPG data, file still missing at: {new_path}") - epg_source.status = 'error' - epg_source.last_message = f"Failed to download EPG data, file missing after download" - epg_source.save(update_fields=['status', 'last_message']) - send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="File not found after download") - release_task_lock('parse_epg_programs', epg_id) - return - else: - logger.error(f"No URL provided for EPG source {epg_source.name}, cannot fetch new data") - # Update status to error - epg_source.status = 'error' - epg_source.last_message = f"No URL provided, cannot fetch EPG data" - epg_source.save(update_fields=['status', 'last_message']) - send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="No URL provided") + if not Channel.objects.filter(epg_data=epg).exists(): + logger.info(f"No channels matched to EPG {epg.tvg_id}") release_task_lock('parse_epg_programs', epg_id) return - file_path = new_path + logger.info(f"Refreshing program data for tvg_id: {epg.tvg_id}") - # Read entire file (decompress if .gz) - try: - if file_path.endswith('.gz'): - with open(file_path, 'rb') as gz_file: - decompressed = gzip.decompress(gz_file.read()) - xml_data = decompressed.decode('utf-8') - else: - with open(file_path, 'r', encoding='utf-8') as xml_file: - xml_data = xml_file.read() - except FileNotFoundError: - logger.error(f"EPG file not found at: {file_path}") + # First, 
remove all existing programs - use chunked delete to avoid memory issues + chunk_size = 5000 + programs_to_delete = ProgramData.objects.filter(epg=epg) + total_programs = programs_to_delete.count() + + if total_programs > 0: + logger.info(f"Deleting {total_programs} existing programs for {epg.tvg_id}") + + # Get only the IDs to conserve memory + program_ids = list(programs_to_delete.values_list('id', flat=True)) + + # Delete in chunks using ID-based filtering + for i in range(0, len(program_ids), chunk_size): + chunk_ids = program_ids[i:i + chunk_size] + ProgramData.objects.filter(id__in=chunk_ids).delete() + gc.collect() # Force garbage collection after batch delete + + file_path = epg_source.file_path + if not file_path: + file_path = epg_source.get_cache_file() + + # Check if the file exists + if not os.path.exists(file_path): + logger.error(f"EPG file not found at: {file_path}") + + # Update the file path in the database + new_path = epg_source.get_cache_file() + logger.info(f"Updating file_path from '{file_path}' to '{new_path}'") + epg_source.file_path = new_path + epg_source.save(update_fields=['file_path']) + + # Fetch new data before continuing + if epg_source.url: + logger.info(f"Fetching new EPG data from URL: {epg_source.url}") + # Properly check the return value from fetch_xmltv + fetch_success = fetch_xmltv(epg_source) + + # If fetch was not successful or the file still doesn't exist, abort + if not fetch_success: + logger.error(f"Failed to fetch EPG data, cannot parse programs for tvg_id: {epg.tvg_id}") + # Update status to error if not already set + epg_source.status = 'error' + epg_source.last_message = f"Failed to download EPG data, cannot parse programs" + epg_source.save(update_fields=['status', 'last_message']) + send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="Failed to download EPG file") + release_task_lock('parse_epg_programs', epg_id) + return + + # Also check if the file exists after download + if not os.path.exists(new_path): + logger.error(f"Failed to fetch EPG data, file still missing at: {new_path}") + epg_source.status = 'error' + epg_source.last_message = f"Failed to download EPG data, file missing after download" + epg_source.save(update_fields=['status', 'last_message']) + send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="File not found after download") + release_task_lock('parse_epg_programs', epg_id) + return + else: + logger.error(f"No URL provided for EPG source {epg_source.name}, cannot fetch new data") + # Update status to error + epg_source.status = 'error' + epg_source.last_message = f"No URL provided, cannot fetch EPG data" + epg_source.save(update_fields=['status', 'last_message']) + send_epg_update(epg_source.id, "parsing_programs", 100, status="error", error="No URL provided") + release_task_lock('parse_epg_programs', epg_id) + return + + file_path = new_path + + # Use streaming parsing to reduce memory usage + is_gzipped = file_path.endswith('.gz') + + logger.info(f"Parsing programs for tvg_id={epg.tvg_id} from {file_path}") + + programs_to_create = [] + batch_size = 1000 # Process in batches to limit memory usage + + try: + # Create a parser with the desired options + parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) + + # Open the file properly + source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') + + # Stream parse the file using lxml's iterparse (without parser parameter) + program_parser = etree.iterparse(source_file, events=('end',), 
tag='programme') + + for _, elem in program_parser: + if elem.get('channel') == epg.tvg_id: + try: + start_time = parse_xmltv_time(elem.get('start')) + end_time = parse_xmltv_time(elem.get('stop')) + title = None + desc = None + sub_title = None + + # Efficiently process child elements + for child in elem: + if child.tag == 'title': + title = child.text or 'No Title' + elif child.tag == 'desc': + desc = child.text or '' + elif child.tag == 'sub-title': + sub_title = child.text or '' + + if not title: + title = 'No Title' + + # Extract custom properties + custom_props = extract_custom_properties(elem) + custom_properties_json = None + if custom_props: + try: + custom_properties_json = json.dumps(custom_props) + except Exception as e: + logger.error(f"Error serializing custom properties to JSON: {e}", exc_info=True) + + programs_to_create.append(ProgramData( + epg=epg, + start_time=start_time, + end_time=end_time, + title=title, + description=desc, + sub_title=sub_title, + tvg_id=epg.tvg_id, + custom_properties=custom_properties_json + )) + + # Batch processing + if len(programs_to_create) >= batch_size: + ProgramData.objects.bulk_create(programs_to_create) + logger.debug(f"Saved batch of {len(programs_to_create)} programs for {epg.tvg_id}") + programs_to_create = [] + # Force garbage collection after batch processing + gc.collect() + + except Exception as e: + logger.error(f"Error processing program for {epg.tvg_id}: {e}", exc_info=True) + + # Important: Clear the element to avoid memory leaks (lxml specific method) + elem.clear() + # Also eliminate ancestors to prevent memory leaks + while elem.getprevious() is not None: + del elem.getparent()[0] + + # Make sure to close the file + source_file.close() + + except etree.XMLSyntaxError as xml_error: + logger.error(f"XML syntax error parsing program data: {xml_error}") + raise + except Exception as e: + logger.error(f"Error parsing XML for programs: {e}", exc_info=True) + raise + + # Process any remaining items + if programs_to_create: + ProgramData.objects.bulk_create(programs_to_create) + logger.debug(f"Saved final batch of {len(programs_to_create)} programs for {epg.tvg_id}") + + # Final garbage collection + gc.collect() + + logger.info(f"Completed program parsing for tvg_id={epg.tvg_id}.") + finally: release_task_lock('parse_epg_programs', epg_id) - return - except Exception as e: - logger.error(f"Error reading EPG file {file_path}: {e}", exc_info=True) - release_task_lock('parse_epg_programs', epg_id) - return - - root = ET.fromstring(xml_data) - - # Find only elements for this tvg_id - matched_programmes = [p for p in root.findall('programme') if p.get('channel') == epg.tvg_id] - logger.debug(f"Found {len(matched_programmes)} programmes for tvg_id={epg.tvg_id}") - - programs_to_create = [] - for prog in matched_programmes: - start_time = parse_xmltv_time(prog.get('start')) - end_time = parse_xmltv_time(prog.get('stop')) - title = prog.findtext('title', default='No Title') - desc = prog.findtext('desc', default='') - sub_title = prog.findtext('sub-title', default='') - - # Extract custom properties - custom_props = {} - - # Extract categories - categories = [] - for cat_elem in prog.findall('category'): - if cat_elem.text and cat_elem.text.strip(): - categories.append(cat_elem.text.strip()) - if categories: - custom_props['categories'] = categories - - # Extract episode numbers - for ep_num in prog.findall('episode-num'): - system = ep_num.get('system', '') - if system == 'xmltv_ns' and ep_num.text: - # Parse XMLTV episode-num format 
(season.episode.part) - parts = ep_num.text.split('.') - if len(parts) >= 2: - if parts[0].strip() != '': - try: - season = int(parts[0]) + 1 # XMLTV format is zero-based - custom_props['season'] = season - except ValueError: - pass - if parts[1].strip() != '': - try: - episode = int(parts[1]) + 1 # XMLTV format is zero-based - custom_props['episode'] = episode - except ValueError: - pass - elif system == 'onscreen' and ep_num.text: - # Just store the raw onscreen format - custom_props['onscreen_episode'] = ep_num.text.strip() - - # Extract ratings - for rating_elem in prog.findall('rating'): - if rating_elem.findtext('value'): - custom_props['rating'] = rating_elem.findtext('value').strip() - if rating_elem.get('system'): - custom_props['rating_system'] = rating_elem.get('system') - break # Just use the first rating - - # Extract credits (actors, directors, etc.) - credits_elem = prog.find('credits') - if credits_elem is not None: - credits = {} - for credit_type in ['director', 'actor', 'writer', 'presenter', 'producer']: - elements = credits_elem.findall(credit_type) - if elements: - names = [e.text.strip() for e in elements if e.text and e.text.strip()] - if names: - credits[credit_type] = names - if credits: - custom_props['credits'] = credits - - # Extract other common program metadata - if prog.findtext('date'): - custom_props['year'] = prog.findtext('date').strip()[:4] # Just the year part - - if prog.findtext('country'): - custom_props['country'] = prog.findtext('country').strip() - - for icon_elem in prog.findall('icon'): - if icon_elem.get('src'): - custom_props['icon'] = icon_elem.get('src') - break # Just use the first icon - - for kw in ['previously-shown', 'premiere', 'new']: - if prog.find(kw) is not None: - custom_props[kw.replace('-', '_')] = True - - # Convert custom_props to JSON string if not empty - custom_properties_json = None - if custom_props: - import json - try: - custom_properties_json = json.dumps(custom_props) - except Exception as e: - logger.error(f"Error serializing custom properties to JSON: {e}", exc_info=True) - - programs_to_create.append(ProgramData( - epg=epg, - start_time=start_time, - end_time=end_time, - title=title, - description=desc, - sub_title=sub_title, - tvg_id=epg.tvg_id, - custom_properties=custom_properties_json - )) - - ProgramData.objects.bulk_create(programs_to_create) - - release_task_lock('parse_epg_programs', epg_id) - - logger.info(f"Completed program parsing for tvg_id={epg.tvg_id}.") def parse_programs_for_source(epg_source, tvg_id=None): @@ -966,3 +1043,81 @@ def parse_schedules_direct_time(time_str): except Exception as e: logger.error(f"Error parsing Schedules Direct time '{time_str}': {e}", exc_info=True) raise + + +# Helper function to extract custom properties - moved to a separate function to clean up the code +def extract_custom_properties(prog): + custom_props = {} + + # Extract categories + categories = [] + for cat_elem in prog.findall('category'): + if cat_elem.text and cat_elem.text.strip(): + categories.append(cat_elem.text.strip()) + if categories: + custom_props['categories'] = categories + + # Extract episode numbers + for ep_num in prog.findall('episode-num'): + system = ep_num.get('system', '') + if system == 'xmltv_ns' and ep_num.text: + # Parse XMLTV episode-num format (season.episode.part) + parts = ep_num.text.split('.') + if len(parts) >= 2: + if parts[0].strip() != '': + try: + season = int(parts[0]) + 1 # XMLTV format is zero-based + custom_props['season'] = season + except ValueError: + pass + if 
parts[1].strip() != '': + try: + episode = int(parts[1]) + 1 # XMLTV format is zero-based + custom_props['episode'] = episode + except ValueError: + pass + elif system == 'onscreen' and ep_num.text: + # Just store the raw onscreen format + custom_props['onscreen_episode'] = ep_num.text.strip() + + # Extract ratings + for rating_elem in prog.findall('rating'): + value_elem = rating_elem.find('value') + if value_elem is not None and value_elem.text: + custom_props['rating'] = value_elem.text.strip() + if rating_elem.get('system'): + custom_props['rating_system'] = rating_elem.get('system') + break # Just use the first rating + + # Extract credits (actors, directors, etc.) + credits_elem = prog.find('credits') + if credits_elem is not None: + credits = {} + for credit_type in ['director', 'actor', 'writer', 'presenter', 'producer']: + elements = credits_elem.findall(credit_type) + if elements: + names = [e.text.strip() for e in elements if e.text and e.text.strip()] + if names: + credits[credit_type] = names + if credits: + custom_props['credits'] = credits + + # Extract other common program metadata + date_elem = prog.find('date') + if date_elem is not None and date_elem.text: + custom_props['year'] = date_elem.text.strip()[:4] # Just the year part + + country_elem = prog.find('country') + if country_elem is not None and country_elem.text: + custom_props['country'] = country_elem.text.strip() + + for icon_elem in prog.findall('icon'): + if icon_elem.get('src'): + custom_props['icon'] = icon_elem.get('src') + break # Just use the first icon + + for kw in ['previously-shown', 'premiere', 'new']: + if prog.find(kw) is not None: + custom_props[kw.replace('-', '_')] = True + + return custom_props diff --git a/core/utils.py b/core/utils.py index 6b5e6815..01463ad9 100644 --- a/core/utils.py +++ b/core/utils.py @@ -59,6 +59,10 @@ class RedisClient: client.config_set('save', '') # Disable RDB snapshots client.config_set('appendonly', 'no') # Disable AOF logging + # Set optimal memory settings + client.config_set('maxmemory-policy', 'allkeys-lru') # Use LRU eviction + client.config_set('maxmemory', '256mb') # Set reasonable memory limit + # Disable protected mode when in debug mode if os.environ.get('DISPATCHARR_DEBUG', '').lower() == 'true': client.config_set('protected-mode', 'no') # Disable protected mode in debug @@ -178,3 +182,34 @@ def send_websocket_event(event, success, data): "data": {"success": True, "type": "epg_channels"} } ) + +# Add memory monitoring utilities +def get_memory_usage(): + """Returns current memory usage in MB""" + import psutil + process = psutil.Process(os.getpid()) + return process.memory_info().rss / (1024 * 1024) + +def monitor_memory_usage(func): + """Decorator to monitor memory usage before and after function execution""" + def wrapper(*args, **kwargs): + import gc + # Force garbage collection before measuring + gc.collect() + + # Get initial memory usage + start_mem = get_memory_usage() + logger.debug(f"Memory usage before {func.__name__}: {start_mem:.2f} MB") + + # Call the original function + result = func(*args, **kwargs) + + # Force garbage collection before measuring again + gc.collect() + + # Get final memory usage + end_mem = get_memory_usage() + logger.debug(f"Memory usage after {func.__name__}: {end_mem:.2f} MB (Change: {end_mem - start_mem:.2f} MB)") + + return result + return wrapper diff --git a/dispatcharr/settings.py b/dispatcharr/settings.py index 02d04597..9eb7dd2b 100644 --- a/dispatcharr/settings.py +++ b/dispatcharr/settings.py @@ -43,6 +43,34 @@ 
INSTALLED_APPS = [ 'django_celery_beat', ] +# EPG Processing optimization settings +EPG_BATCH_SIZE = 1000 # Number of records to process in a batch +EPG_MEMORY_LIMIT = 512 # Memory limit in MB before forcing garbage collection +EPG_ENABLE_MEMORY_MONITORING = True # Whether to monitor memory usage during processing + +# Database optimization settings +DATABASE_STATEMENT_TIMEOUT = 300 # Seconds before timing out long-running queries +DATABASE_CONN_MAX_AGE = 60 # Connection max age in seconds, helps with frequent reconnects + +# Disable atomic requests for performance-sensitive views +ATOMIC_REQUESTS = False + +# Cache settings - add caching for EPG operations +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', + 'LOCATION': 'dispatcharr-epg-cache', + 'TIMEOUT': 3600, # 1 hour cache timeout + 'OPTIONS': { + 'MAX_ENTRIES': 10000, + 'CULL_FREQUENCY': 3, # Purge 1/3 of entries when max is reached + } + } +} + +# Timeouts for external connections +REQUESTS_TIMEOUT = 30 # Seconds for external API requests + MIDDLEWARE = [ 'django.middleware.security.SecurityMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', diff --git a/requirements.txt b/requirements.txt index 7d7117f4..8810e336 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,5 @@ channels channels-redis django-filter django-celery-beat +memory-profiler==0.61.0 +lxml==5.4.0 From 7fe618b0372c89734612dee984fccd0648114984 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Fri, 16 May 2025 22:15:21 -0500 Subject: [PATCH 02/25] Much better memory usage. About half as much --- apps/epg/tasks.py | 129 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 29 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 395f49cb..7f042e3a 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -519,7 +519,24 @@ def parse_channels_only(source): file_path = new_path logger.info(f"Parsing channels from EPG file: {file_path}") - existing_epgs = {e.tvg_id: e for e in EPGData.objects.filter(epg_source=source)} + + # Replace full dictionary load with more efficient lookup set + existing_tvg_ids = set() + existing_epgs = {} # Initialize the dictionary that will lazily load objects + last_id = 0 + chunk_size = 5000 + + while True: + tvg_id_chunk = set(EPGData.objects.filter( + epg_source=source, + id__gt=last_id + ).order_by('id').values_list('tvg_id', flat=True)[:chunk_size]) + + if not tvg_id_chunk: + break + + existing_tvg_ids.update(tvg_id_chunk) + last_id = EPGData.objects.filter(tvg_id__in=tvg_id_chunk).order_by('-id')[0].id # Update progress to show file read starting send_epg_update(source.id, "parsing_channels", 10) @@ -545,7 +562,10 @@ def parse_channels_only(source): channel_finder = etree.iterparse(source_file, events=('end',), tag='channel') # Count channels - total_channels = sum(1 for _ in channel_finder) + try: + total_channels = EPGData.objects.filter(epg_source=source).count() + except: + total_channels = 500 # Default estimate # Close the file to reset position source_file.close() @@ -569,7 +589,22 @@ def parse_channels_only(source): if not display_name: display_name = tvg_id - if tvg_id in existing_epgs: + # Use lazy loading approach to reduce memory usage + if tvg_id in existing_tvg_ids: + # Only fetch the object if we need to update it and it hasn't been loaded yet + if tvg_id not in existing_epgs: + try: + existing_epgs[tvg_id] = EPGData.objects.get(tvg_id=tvg_id, epg_source=source) + except EPGData.DoesNotExist: + # Handle race 
condition where record was deleted + existing_tvg_ids.remove(tvg_id) + epgs_to_create.append(EPGData( + tvg_id=tvg_id, + name=display_name, + epg_source=source, + )) + continue + epg_obj = existing_epgs[tvg_id] if epg_obj.name != display_name: epg_obj.name = display_name @@ -596,6 +631,11 @@ def parse_channels_only(source): # Force garbage collection gc.collect() + # Periodically clear the existing_epgs cache to prevent memory buildup + if processed_channels % 1000 == 0: + existing_epgs.clear() + gc.collect() + # Send progress updates if processed_channels % 100 == 0 or processed_channels == total_channels: progress = 25 + int((processed_channels / total_channels) * 65) if total_channels > 0 else 90 @@ -677,6 +717,10 @@ def parse_channels_only(source): source.save(update_fields=['status', 'last_message']) send_epg_update(source.id, "parsing_channels", 100, status="error", error=str(e)) return False + finally: + existing_tvg_ids = None + existing_epgs = None + gc.collect() @shared_task @@ -704,14 +748,20 @@ def parse_programs_for_tvg_id(epg_id): if total_programs > 0: logger.info(f"Deleting {total_programs} existing programs for {epg.tvg_id}") - # Get only the IDs to conserve memory - program_ids = list(programs_to_delete.values_list('id', flat=True)) + # More memory-efficient approach using cursor-based pagination + last_id = 0 + while True: + # Get batch of IDs greater than the last ID processed + id_batch = list(programs_to_delete.filter(id__gt=last_id).order_by('id').values_list('id', flat=True)[:chunk_size]) + if not id_batch: + break - # Delete in chunks using ID-based filtering - for i in range(0, len(program_ids), chunk_size): - chunk_ids = program_ids[i:i + chunk_size] - ProgramData.objects.filter(id__in=chunk_ids).delete() - gc.collect() # Force garbage collection after batch delete + # Delete this batch + ProgramData.objects.filter(id__in=id_batch).delete() + gc.collect() + + # Update last_id for next iteration + last_id = id_batch[-1] if id_batch else 0 file_path = epg_source.file_path if not file_path: @@ -869,11 +919,11 @@ def parse_programs_for_source(epg_source, tvg_id=None): send_epg_update(epg_source.id, "parsing_programs", 0) try: - epg_entries = EPGData.objects.filter(epg_source=epg_source) - total_entries = epg_entries.count() - processed = 0 + # Process EPG entries in batches rather than all at once + batch_size = 20 # Process fewer channels at once to reduce memory usage + epg_count = EPGData.objects.filter(epg_source=epg_source).count() - if total_entries == 0: + if epg_count == 0: logger.info(f"No EPG entries found for source: {epg_source.name}") # Update status - this is not an error, just no entries epg_source.status = 'success' @@ -881,31 +931,52 @@ def parse_programs_for_source(epg_source, tvg_id=None): send_epg_update(epg_source.id, "parsing_programs", 100, status="success") return True - logger.info(f"Parsing programs for {total_entries} EPG entries from source: {epg_source.name}") + logger.info(f"Parsing programs for {epg_count} EPG entries from source: {epg_source.name}") failed_entries = [] program_count = 0 channel_count = 0 updated_count = 0 + processed = 0 - for epg in epg_entries: - if epg.tvg_id: - try: - result = parse_programs_for_tvg_id(epg.id) - if result == "Task already running": - logger.info(f"Program parse for {epg.id} already in progress, skipping") + # Process in batches using cursor-based approach to limit memory usage + last_id = 0 + while True: + # Get a batch of EPG entries + batch_entries = list(EPGData.objects.filter( + 
epg_source=epg_source, + id__gt=last_id + ).order_by('id')[:batch_size]) - processed += 1 - progress = min(95, int((processed / total_entries) * 100)) if total_entries > 0 else 50 - send_epg_update(epg_source.id, "parsing_programs", progress) - except Exception as e: - logger.error(f"Error parsing programs for tvg_id={epg.tvg_id}: {e}", exc_info=True) - failed_entries.append(f"{epg.tvg_id}: {str(e)}") + if not batch_entries: + break # No more entries to process + + # Update last_id for next iteration + last_id = batch_entries[-1].id + + # Process this batch + for epg in batch_entries: + if epg.tvg_id: + try: + result = parse_programs_for_tvg_id(epg.id) + if result == "Task already running": + logger.info(f"Program parse for {epg.id} already in progress, skipping") + + processed += 1 + progress = min(95, int((processed / epg_count) * 100)) if epg_count > 0 else 50 + send_epg_update(epg_source.id, "parsing_programs", progress) + except Exception as e: + logger.error(f"Error parsing programs for tvg_id={epg.tvg_id}: {e}", exc_info=True) + failed_entries.append(f"{epg.tvg_id}: {str(e)}") + + # Force garbage collection after each batch + batch_entries = None # Remove reference to help garbage collection + gc.collect() # If there were failures, include them in the message but continue if failed_entries: epg_source.status = EPGSource.STATUS_SUCCESS # Still mark as success if some processed - error_summary = f"Failed to parse {len(failed_entries)} of {total_entries} entries" + error_summary = f"Failed to parse {len(failed_entries)} of {epg_count} entries" stats_summary = f"Processed {program_count} programs across {channel_count} channels. Updated: {updated_count}." epg_source.last_message = f"{stats_summary} Warning: {error_summary}" epg_source.updated_at = timezone.now() @@ -1027,7 +1098,7 @@ def parse_xmltv_time(time_str): elif tz_sign == '-': dt_obj = dt_obj + timedelta(hours=tz_hours, minutes=tz_minutes) aware_dt = timezone.make_aware(dt_obj, timezone=dt_timezone.utc) - logger.debug(f"Parsed XMLTV time '{time_str}' to {aware_dt}") + logger.trace(f"Parsed XMLTV time '{time_str}' to {aware_dt}") return aware_dt except Exception as e: logger.error(f"Error parsing XMLTV time '{time_str}': {e}", exc_info=True) From 1174e2e0c73dbc0cda36676946b237b74a9d1c24 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sat, 17 May 2025 16:42:37 -0500 Subject: [PATCH 03/25] EPG processing enhancements. Celery memory management. --- apps/channels/tasks.py | 260 ++++++++++++++++++----------------- apps/epg/tasks.py | 292 ++++++++++++++++++++++++++++++++++------ core/tasks.py | 34 ++++- dispatcharr/celery.py | 19 +++ dispatcharr/settings.py | 9 ++ docker/uwsgi.debug.ini | 2 +- 6 files changed, 444 insertions(+), 172 deletions(-) diff --git a/apps/channels/tasks.py b/apps/channels/tasks.py index 88d040e8..b4de5e07 100755 --- a/apps/channels/tasks.py +++ b/apps/channels/tasks.py @@ -63,146 +63,162 @@ def match_epg_channels(): 4) If a match is found, we set channel.tvg_id 5) Summarize and log results. 
""" - logger.info("Starting EPG matching logic...") - - # Attempt to retrieve a "preferred-region" if configured try: - region_obj = CoreSettings.objects.get(key="preferred-region") - region_code = region_obj.value.strip().lower() - except CoreSettings.DoesNotExist: - region_code = None + logger.info("Starting EPG matching logic...") - matched_channels = [] - channels_to_update = [] + # Attempt to retrieve a "preferred-region" if configured + try: + region_obj = CoreSettings.objects.get(key="preferred-region") + region_code = region_obj.value.strip().lower() + except CoreSettings.DoesNotExist: + region_code = None - # Get channels that don't have EPG data assigned - channels_without_epg = Channel.objects.filter(epg_data__isnull=True) - logger.info(f"Found {channels_without_epg.count()} channels without EPG data") + matched_channels = [] + channels_to_update = [] - channels_json = [] - for channel in channels_without_epg: - # Normalize TVG ID - strip whitespace and convert to lowercase - normalized_tvg_id = channel.tvg_id.strip().lower() if channel.tvg_id else "" - if normalized_tvg_id: - logger.info(f"Processing channel {channel.id} '{channel.name}' with TVG ID='{normalized_tvg_id}'") + # Get channels that don't have EPG data assigned + channels_without_epg = Channel.objects.filter(epg_data__isnull=True) + logger.info(f"Found {channels_without_epg.count()} channels without EPG data") - channels_json.append({ - "id": channel.id, - "name": channel.name, - "tvg_id": normalized_tvg_id, # Use normalized TVG ID - "original_tvg_id": channel.tvg_id, # Keep original for reference - "fallback_name": normalized_tvg_id if normalized_tvg_id else channel.name, - "norm_chan": normalize_name(normalized_tvg_id if normalized_tvg_id else channel.name) - }) + channels_json = [] + for channel in channels_without_epg: + # Normalize TVG ID - strip whitespace and convert to lowercase + normalized_tvg_id = channel.tvg_id.strip().lower() if channel.tvg_id else "" + if normalized_tvg_id: + logger.info(f"Processing channel {channel.id} '{channel.name}' with TVG ID='{normalized_tvg_id}'") - # Similarly normalize EPG data TVG IDs - epg_json = [] - for epg in EPGData.objects.all(): - normalized_tvg_id = epg.tvg_id.strip().lower() if epg.tvg_id else "" - epg_json.append({ - 'id': epg.id, - 'tvg_id': normalized_tvg_id, # Use normalized TVG ID - 'original_tvg_id': epg.tvg_id, # Keep original for reference - 'name': epg.name, - 'norm_name': normalize_name(epg.name), - 'epg_source_id': epg.epg_source.id if epg.epg_source else None, - }) + channels_json.append({ + "id": channel.id, + "name": channel.name, + "tvg_id": normalized_tvg_id, # Use normalized TVG ID + "original_tvg_id": channel.tvg_id, # Keep original for reference + "fallback_name": normalized_tvg_id if normalized_tvg_id else channel.name, + "norm_chan": normalize_name(normalized_tvg_id if normalized_tvg_id else channel.name) + }) - # Log available EPG data TVG IDs for debugging - unique_epg_tvg_ids = set(e['tvg_id'] for e in epg_json if e['tvg_id']) - logger.info(f"Available EPG TVG IDs: {', '.join(sorted(unique_epg_tvg_ids))}") + # Similarly normalize EPG data TVG IDs + epg_json = [] + for epg in EPGData.objects.all(): + normalized_tvg_id = epg.tvg_id.strip().lower() if epg.tvg_id else "" + epg_json.append({ + 'id': epg.id, + 'tvg_id': normalized_tvg_id, # Use normalized TVG ID + 'original_tvg_id': epg.tvg_id, # Keep original for reference + 'name': epg.name, + 'norm_name': normalize_name(epg.name), + 'epg_source_id': epg.epg_source.id if epg.epg_source else 
None, + }) - payload = { - "channels": channels_json, - "epg_data": epg_json, - "region_code": region_code, - } + # Log available EPG data TVG IDs for debugging + unique_epg_tvg_ids = set(e['tvg_id'] for e in epg_json if e['tvg_id']) + logger.info(f"Available EPG TVG IDs: {', '.join(sorted(unique_epg_tvg_ids))}") - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_file.write(json.dumps(payload).encode('utf-8')) - temp_file_path = temp_file.name - - process = subprocess.Popen( - ['python', '/app/scripts/epg_match.py', temp_file_path], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - - # Log stderr in real-time - for line in iter(process.stderr.readline, ''): - if line: - logger.info(line.strip()) - - process.stderr.close() - stdout, stderr = process.communicate() - - os.remove(temp_file_path) - - if process.returncode != 0: - return f"Failed to process EPG matching: {stderr}" - - result = json.loads(stdout) - # This returns lists of dicts, not model objects - channels_to_update_dicts = result["channels_to_update"] - matched_channels = result["matched_channels"] - - # Convert your dict-based 'channels_to_update' into real Channel objects - if channels_to_update_dicts: - # Extract IDs of the channels that need updates - channel_ids = [d["id"] for d in channels_to_update_dicts] - - # Fetch them from DB - channels_qs = Channel.objects.filter(id__in=channel_ids) - channels_list = list(channels_qs) - - # Build a map from channel_id -> epg_data_id (or whatever fields you need) - epg_mapping = { - d["id"]: d["epg_data_id"] for d in channels_to_update_dicts + payload = { + "channels": channels_json, + "epg_data": epg_json, + "region_code": region_code, } - # Populate each Channel object with the updated epg_data_id - for channel_obj in channels_list: - # The script sets 'epg_data_id' in the returned dict - # We either assign directly, or fetch the EPGData instance if needed. 
- channel_obj.epg_data_id = epg_mapping.get(channel_obj.id) + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file.write(json.dumps(payload).encode('utf-8')) + temp_file_path = temp_file.name - # Now we have real model objects, so bulk_update will work - Channel.objects.bulk_update(channels_list, ["epg_data"]) + # After writing to the file but before subprocess + # Explicitly delete the large data structures + del payload + gc.collect() - total_matched = len(matched_channels) - if total_matched: - logger.info(f"Match Summary: {total_matched} channel(s) matched.") - for (cid, cname, tvg) in matched_channels: - logger.info(f" - Channel ID={cid}, Name='{cname}' => tvg_id='{tvg}'") - else: - logger.info("No new channels were matched.") + process = subprocess.Popen( + ['python', '/app/scripts/epg_match.py', temp_file_path], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) - logger.info("Finished EPG matching logic.") + # Log stderr in real-time + for line in iter(process.stderr.readline, ''): + if line: + logger.info(line.strip()) - # Send update with additional information for refreshing UI - channel_layer = get_channel_layer() - associations = [ - {"channel_id": chan["id"], "epg_data_id": chan["epg_data_id"]} - for chan in channels_to_update_dicts - ] + process.stderr.close() + stdout, stderr = process.communicate() - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - "data": { - "success": True, - "type": "epg_match", - "refresh_channels": True, # Flag to tell frontend to refresh channels - "matches_count": total_matched, - "message": f"EPG matching complete: {total_matched} channel(s) matched", - "associations": associations # Add the associations data + os.remove(temp_file_path) + + if process.returncode != 0: + return f"Failed to process EPG matching: {stderr}" + + result = json.loads(stdout) + # This returns lists of dicts, not model objects + channels_to_update_dicts = result["channels_to_update"] + matched_channels = result["matched_channels"] + + # Explicitly clean up large objects + del stdout, result + gc.collect() + + # Convert your dict-based 'channels_to_update' into real Channel objects + if channels_to_update_dicts: + # Extract IDs of the channels that need updates + channel_ids = [d["id"] for d in channels_to_update_dicts] + + # Fetch them from DB + channels_qs = Channel.objects.filter(id__in=channel_ids) + channels_list = list(channels_qs) + + # Build a map from channel_id -> epg_data_id (or whatever fields you need) + epg_mapping = { + d["id"]: d["epg_data_id"] for d in channels_to_update_dicts } - } - ) - return f"Done. Matched {total_matched} channel(s)." + # Populate each Channel object with the updated epg_data_id + for channel_obj in channels_list: + # The script sets 'epg_data_id' in the returned dict + # We either assign directly, or fetch the EPGData instance if needed. 
+ channel_obj.epg_data_id = epg_mapping.get(channel_obj.id) + + # Now we have real model objects, so bulk_update will work + Channel.objects.bulk_update(channels_list, ["epg_data"]) + + total_matched = len(matched_channels) + if total_matched: + logger.info(f"Match Summary: {total_matched} channel(s) matched.") + for (cid, cname, tvg) in matched_channels: + logger.info(f" - Channel ID={cid}, Name='{cname}' => tvg_id='{tvg}'") + else: + logger.info("No new channels were matched.") + + logger.info("Finished EPG matching logic.") + + # Send update with additional information for refreshing UI + channel_layer = get_channel_layer() + associations = [ + {"channel_id": chan["id"], "epg_data_id": chan["epg_data_id"]} + for chan in channels_to_update_dicts + ] + + async_to_sync(channel_layer.group_send)( + 'updates', + { + 'type': 'update', + "data": { + "success": True, + "type": "epg_match", + "refresh_channels": True, # Flag to tell frontend to refresh channels + "matches_count": total_matched, + "message": f"EPG matching complete: {total_matched} channel(s) matched", + "associations": associations # Add the associations data + } + } + ) + + return f"Done. Matched {total_matched} channel(s)." + finally: + # Final cleanup + gc.collect() + # Force an even more aggressive cleanup + import gc + gc.collect(generation=2) @shared_task diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 7f042e3a..74a77f0e 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -117,6 +117,8 @@ def refresh_all_epg_data(): for source in active_sources: refresh_epg_data(source.id) + # Force garbage collection between sources + gc.collect() logger.info("Finished refresh_epg_data task.") return "EPG data refreshed." @@ -128,6 +130,7 @@ def refresh_epg_data(source_id): logger.debug(f"EPG refresh for {source_id} already running") return + source = None try: # Try to get the EPG source try: @@ -144,12 +147,16 @@ def refresh_epg_data(source_id): # Release the lock and exit release_task_lock('refresh_epg_data', source_id) + # Force garbage collection before exit + gc.collect() return f"EPG source {source_id} does not exist, task cleaned up" # The source exists but is not active, just skip processing if not source.is_active: logger.info(f"EPG source {source_id} is not active. Skipping.") release_task_lock('refresh_epg_data', source_id) + # Force garbage collection before exit + gc.collect() return # Continue with the normal processing... 
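This commit's message also mentions Celery memory management, and its diffstat touches dispatcharr/celery.py and dispatcharr/settings.py. A minimal sketch of the kind of worker-recycling configuration that phrase typically refers to — the option names below are stock Celery settings, but the application object name and the values are placeholder assumptions, not necessarily what this patch uses:

    # Illustrative only: bound per-worker memory growth by recycling worker processes.
    # Assumes a standard celery.py that defines an application object named `app`.
    app.conf.worker_max_tasks_per_child = 50       # restart a worker process after 50 tasks
    app.conf.worker_max_memory_per_child = 512000  # or once its resident memory passes ~512 MB (value is in KiB)
    app.conf.task_time_limit = 1800                # hard per-task runtime cap, in seconds

Worker recycling complements the explicit gc.collect() calls added above: collection frees Python-level garbage, while restarting the worker process returns fragmented heap pages to the operating system.
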
@@ -159,12 +166,16 @@ def refresh_epg_data(source_id): if not fetch_success: logger.error(f"Failed to fetch XMLTV for source {source.name}") release_task_lock('refresh_epg_data', source_id) + # Force garbage collection before exit + gc.collect() return parse_channels_success = parse_channels_only(source) if not parse_channels_success: logger.error(f"Failed to parse channels for source {source.name}") release_task_lock('refresh_epg_data', source_id) + # Force garbage collection before exit + gc.collect() return parse_programs_for_source(source) @@ -176,14 +187,18 @@ def refresh_epg_data(source_id): except Exception as e: logger.error(f"Error in refresh_epg_data for source {source_id}: {e}", exc_info=True) try: - source = EPGSource.objects.get(id=source_id) - source.status = 'error' - source.last_message = f"Error refreshing EPG data: {str(e)}" - source.save(update_fields=['status', 'last_message']) - send_epg_update(source_id, "refresh", 100, status="error", error=str(e)) + if source: + source.status = 'error' + source.last_message = f"Error refreshing EPG data: {str(e)}" + source.save(update_fields=['status', 'last_message']) + send_epg_update(source_id, "refresh", 100, status="error", error=str(e)) except Exception as inner_e: logger.error(f"Error updating source status: {inner_e}") finally: + # Clear references to ensure proper garbage collection + source = None + # Force garbage collection before releasing the lock + gc.collect() release_task_lock('refresh_epg_data', source_id) @@ -191,7 +206,6 @@ def fetch_xmltv(source): # Handle cases with local file but no URL if not source.url and source.file_path and os.path.exists(source.file_path): logger.info(f"Using existing local file for EPG source: {source.name} at {source.file_path}") - # Set the status to success in the database source.status = 'success' source.save(update_fields=['status']) @@ -350,6 +364,9 @@ def fetch_xmltv(source): downloaded=f"{downloaded / (1024 * 1024):.2f} MB" ) + # Explicitly delete the chunk to free memory immediately + del chunk + # Send completion notification send_epg_update(source.id, "downloading", 100) @@ -517,9 +534,14 @@ def parse_channels_only(source): return False file_path = new_path - logger.info(f"Parsing channels from EPG file: {file_path}") + # Add memory tracking at start + import psutil + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"Initial memory usage: {initial_memory:.2f} MB") + # Replace full dictionary load with more efficient lookup set existing_tvg_ids = set() existing_epgs = {} # Initialize the dictionary that will lazily load objects @@ -537,7 +559,7 @@ def parse_channels_only(source): existing_tvg_ids.update(tvg_id_chunk) last_id = EPGData.objects.filter(tvg_id__in=tvg_id_chunk).order_by('-id')[0].id - + #time.sleep(20) # Update progress to show file read starting send_epg_update(source.id, "parsing_channels", 10) @@ -549,35 +571,49 @@ def parse_channels_only(source): total_channels = 0 processed_channels = 0 batch_size = 500 # Process in batches to limit memory usage + progress = 0 # Initialize progress variable here + + # Track memory at key points + logger.info(f"Memory before opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") try: # Create a parser with the desired options - parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) + #parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) # Count channels for progress reporting - use proper lxml approach # Open the file first + 
logger.info(f"Opening file for initial channel count: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - - # Create an iterparse context without parser parameter - channel_finder = etree.iterparse(source_file, events=('end',), tag='channel') + logger.info(f"Memory after opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Count channels try: total_channels = EPGData.objects.filter(epg_source=source).count() - except: + logger.info(f"Found {total_channels} existing channels for this source") + except Exception as e: + logger.error(f"Error counting channels: {e}") total_channels = 500 # Default estimate # Close the file to reset position + logger.info(f"Closing initial file handle") source_file.close() + logger.info(f"Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update progress after counting send_epg_update(source.id, "parsing_channels", 25, total_channels=total_channels) # Reset file position for actual processing + logger.info(f"Re-opening file for channel parsing: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + logger.info(f"Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"Creating iterparse context") + channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + logger.info(f"Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") + + channel_count = 0 for _, elem in channel_parser: + channel_count += 1 tvg_id = elem.get('id', '').strip() if tvg_id: display_name = None @@ -620,10 +656,13 @@ def parse_channels_only(source): # Batch processing if len(epgs_to_create) >= batch_size: + logger.info(f"Bulk creating {len(epgs_to_create)} EPG entries") EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) + logger.info(f"Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") + del epgs_to_create # Explicit deletion epgs_to_create = [] - # Force garbage collection gc.collect() + logger.info(f"Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") if len(epgs_to_update) >= batch_size: EPGData.objects.bulk_update(epgs_to_update, ["name"]) @@ -633,8 +672,10 @@ def parse_channels_only(source): # Periodically clear the existing_epgs cache to prevent memory buildup if processed_channels % 1000 == 0: + logger.info(f"Clearing existing_epgs cache at {processed_channels} channels") existing_epgs.clear() gc.collect() + logger.info(f"Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Send progress updates if processed_channels % 100 == 0 or processed_channels == total_channels: @@ -646,17 +687,38 @@ def parse_channels_only(source): processed=processed_channels, total=total_channels ) - + logger.debug(f"Processed channel: {tvg_id} - {display_name}") # Clear memory elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] - # Make sure to close the file + # Check if we should break early to avoid excessive sleep + if processed_channels >= total_channels and total_channels > 0: + logger.info(f"Breaking channel processing loop - processed {processed_channels}/{total_channels}") + break + + # Explicit cleanup before sleeping + logger.info(f"Completed channel parsing loop, processed {processed_channels} channels") + logger.info(f"Memory before cleanup: {process.memory_info().rss / 1024 / 
1024:.2f} MB") + + # Explicit cleanup of the parser + del channel_parser + logger.info(f"Deleted channel_parser object") + + # Close the file + logger.info(f"Closing file: {file_path}") source_file.close() + logger.info(f"File closed: {file_path}") + + # Force garbage collection + gc.collect() + logger.info(f"Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + + # Remove long sleep that might be causing issues + # time.sleep(200) # This seems excessive and may be causing issues except (etree.XMLSyntaxError, Exception) as xml_error: - # Instead of falling back, just handle the error logger.error(f"XML parsing failed: {xml_error}") # Update status to error source.status = 'error' @@ -668,12 +730,16 @@ def parse_channels_only(source): # Process any remaining items if epgs_to_create: EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) + logger.info(f"Created final batch of {len(epgs_to_create)} EPG entries") if epgs_to_update: EPGData.objects.bulk_update(epgs_to_update, ["name"]) + logger.info(f"Updated final batch of {len(epgs_to_update)} EPG entries") - # Final garbage collection + # Final garbage collection and memory tracking + logger.info(f"Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") gc.collect() + logger.info(f"Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update source status with channel count source.status = 'success' @@ -699,6 +765,8 @@ def parse_channels_only(source): ) logger.info(f"Finished parsing channel info. Found {processed_channels} channels.") + # Remove excessive sleep + # time.sleep(20) return True except FileNotFoundError: @@ -718,9 +786,19 @@ def parse_channels_only(source): send_epg_update(source.id, "parsing_channels", 100, status="error", error=str(e)) return False finally: + # Add more detailed cleanup in finally block + logger.info("In finally block, ensuring cleanup") existing_tvg_ids = None existing_epgs = None gc.collect() + # Check final memory usage + try: + import psutil + process = psutil.Process() + final_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"Final memory usage: {final_memory:.2f} MB") + except: + pass @shared_task @@ -729,7 +807,22 @@ def parse_programs_for_tvg_id(epg_id): logger.info(f"Program parse for {epg_id} already in progress, skipping duplicate task") return "Task already running" + source_file = None + program_parser = None + programs_to_create = None + epg = None + epg_source = None + try: + # Add memory tracking + try: + import psutil + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Initial memory usage: {initial_memory:.2f} MB") + except ImportError: + process = None + epg = EPGData.objects.get(id=epg_id) epg_source = epg.epg_source @@ -756,12 +849,25 @@ def parse_programs_for_tvg_id(epg_id): if not id_batch: break + # Store the last ID before deleting the batch variable + if id_batch: + max_id = id_batch[-1] + else: + max_id = 0 + # Delete this batch ProgramData.objects.filter(id__in=id_batch).delete() + # Release memory immediately + del id_batch gc.collect() - # Update last_id for next iteration - last_id = id_batch[-1] if id_batch else 0 + # Update last_id for next iteration using our stored value + last_id = max_id + + # Explicitly delete query objects + del programs_to_delete + del last_id + gc.collect() file_path = epg_source.file_path if not file_path: @@ -820,17 +926,23 @@ def parse_programs_for_tvg_id(epg_id): 
logger.info(f"Parsing programs for tvg_id={epg.tvg_id} from {file_path}") + # Memory usage tracking + if process: + mem_before = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Memory before parsing: {mem_before:.2f} MB") + programs_to_create = [] batch_size = 1000 # Process in batches to limit memory usage + programs_processed = 0 try: # Create a parser with the desired options - parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) + #parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) # Open the file properly source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - # Stream parse the file using lxml's iterparse (without parser parameter) + # Stream parse the file using lxml's iterparse program_parser = etree.iterparse(source_file, events=('end',), tag='programme') for _, elem in program_parser: @@ -858,6 +970,7 @@ def parse_programs_for_tvg_id(epg_id): custom_props = extract_custom_properties(elem) custom_properties_json = None if custom_props: + logger.debug(f"Number of custom properties: {len(custom_props)}") try: custom_properties_json = json.dumps(custom_props) except Exception as e: @@ -874,12 +987,19 @@ def parse_programs_for_tvg_id(epg_id): custom_properties=custom_properties_json )) + programs_processed += 1 + custom_props = None + custom_properties_json = None # Batch processing if len(programs_to_create) >= batch_size: ProgramData.objects.bulk_create(programs_to_create) logger.debug(f"Saved batch of {len(programs_to_create)} programs for {epg.tvg_id}") + del programs_to_create # Explicit deletion programs_to_create = [] - # Force garbage collection after batch processing + + # Force more aggressive garbage collection + custom_props = None + custom_properties_json = None gc.collect() except Exception as e: @@ -891,8 +1011,17 @@ def parse_programs_for_tvg_id(epg_id): while elem.getprevious() is not None: del elem.getparent()[0] - # Make sure to close the file - source_file.close() + # Make sure to close the file and release parser resources + if source_file: + source_file.close() + source_file = None + + if program_parser: + program_parser = None + + # Free parser memory + parser = None + gc.collect() except etree.XMLSyntaxError as xml_error: logger.error(f"XML syntax error parsing program data: {xml_error}") @@ -900,17 +1029,54 @@ def parse_programs_for_tvg_id(epg_id): except Exception as e: logger.error(f"Error parsing XML for programs: {e}", exc_info=True) raise + finally: + # Ensure file is closed even if an exception occurs + if source_file: + source_file.close() + source_file = None # Process any remaining items if programs_to_create: ProgramData.objects.bulk_create(programs_to_create) logger.debug(f"Saved final batch of {len(programs_to_create)} programs for {epg.tvg_id}") + del programs_to_create + programs_to_create = [] + + # Memory tracking after processing + if process: + mem_after = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Memory after parsing {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") # Final garbage collection gc.collect() + # One additional garbage collection specifically for lxml elements + # which can sometimes be retained due to reference cycles + gc.collect() + + # Reset internal caches and pools that lxml might be keeping + try: + etree.clear_error_log() + except: + pass + logger.info(f"Completed program parsing for tvg_id={epg.tvg_id}.") finally: + # Explicit cleanup of all 
potentially large objects + if source_file: + try: + source_file.close() + except: + pass + + source_file = None + program_parser = None + programs_to_create = None + epg = None + epg_source = None + + # Force garbage collection before releasing lock + gc.collect() release_task_lock('parse_epg_programs', epg_id) @@ -918,6 +1084,16 @@ def parse_programs_for_source(epg_source, tvg_id=None): # Send initial programs parsing notification send_epg_update(epg_source.id, "parsing_programs", 0) + # Add memory tracking + try: + import psutil + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_source] Initial memory usage: {initial_memory:.2f} MB") + except ImportError: + logger.warning("psutil not available for memory tracking") + process = None + try: # Process EPG entries in batches rather than all at once batch_size = 20 # Process fewer channels at once to reduce memory usage @@ -939,6 +1115,10 @@ def parse_programs_for_source(epg_source, tvg_id=None): updated_count = 0 processed = 0 + # Memory check before batch processing + if process: + logger.info(f"[parse_programs_for_source] Memory before batch processing: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Process in batches using cursor-based approach to limit memory usage last_id = 0 while True: @@ -969,10 +1149,18 @@ def parse_programs_for_source(epg_source, tvg_id=None): logger.error(f"Error parsing programs for tvg_id={epg.tvg_id}: {e}", exc_info=True) failed_entries.append(f"{epg.tvg_id}: {str(e)}") + # Memory check after processing batch + if process: + logger.info(f"[parse_programs_for_source] Memory after processing batch: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Force garbage collection after each batch batch_entries = None # Remove reference to help garbage collection gc.collect() + # Memory check after garbage collection + if process: + logger.info(f"[parse_programs_for_source] Memory after gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # If there were failures, include them in the message but continue if failed_entries: epg_source.status = EPGSource.STATUS_SUCCESS # Still mark as success if some processed @@ -986,6 +1174,11 @@ def parse_programs_for_source(epg_source, tvg_id=None): send_epg_update(epg_source.id, "parsing_programs", 100, status="success", message=epg_source.last_message) + + # Explicitly release memory of large lists before returning + del failed_entries + gc.collect() + return True # If all successful, set a comprehensive success message @@ -1012,6 +1205,21 @@ def parse_programs_for_source(epg_source, tvg_id=None): status="error", message=epg_source.last_message) return False + finally: + # Final memory cleanup and tracking + if process: + # Force garbage collection before measuring + gc.collect() + final_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_source] Final memory usage: {final_memory:.2f} MB") + + # Explicitly release any remaining large data structures + failed_entries = None + program_count = None + channel_count = None + updated_count = None + processed = None + gc.collect() def fetch_schedules_direct(source): @@ -1118,13 +1326,11 @@ def parse_schedules_direct_time(time_str): # Helper function to extract custom properties - moved to a separate function to clean up the code def extract_custom_properties(prog): + # Create a new dictionary for each call custom_props = {} - # Extract categories - categories = [] - for cat_elem in prog.findall('category'): - if 
cat_elem.text and cat_elem.text.strip(): - categories.append(cat_elem.text.strip()) + # Extract categories with a single comprehension to reduce intermediate objects + categories = [cat.text.strip() for cat in prog.findall('category') if cat.text and cat.text.strip()] if categories: custom_props['categories'] = categories @@ -1151,25 +1357,23 @@ def extract_custom_properties(prog): # Just store the raw onscreen format custom_props['onscreen_episode'] = ep_num.text.strip() - # Extract ratings - for rating_elem in prog.findall('rating'): + # Extract ratings more efficiently + rating_elem = prog.find('rating') + if rating_elem is not None: value_elem = rating_elem.find('value') if value_elem is not None and value_elem.text: custom_props['rating'] = value_elem.text.strip() if rating_elem.get('system'): custom_props['rating_system'] = rating_elem.get('system') - break # Just use the first rating - # Extract credits (actors, directors, etc.) + # Extract credits more efficiently credits_elem = prog.find('credits') if credits_elem is not None: credits = {} for credit_type in ['director', 'actor', 'writer', 'presenter', 'producer']: - elements = credits_elem.findall(credit_type) - if elements: - names = [e.text.strip() for e in elements if e.text and e.text.strip()] - if names: - credits[credit_type] = names + names = [e.text.strip() for e in credits_elem.findall(credit_type) if e.text and e.text.strip()] + if names: + credits[credit_type] = names if credits: custom_props['credits'] = credits @@ -1182,11 +1386,11 @@ def extract_custom_properties(prog): if country_elem is not None and country_elem.text: custom_props['country'] = country_elem.text.strip() - for icon_elem in prog.findall('icon'): - if icon_elem.get('src'): - custom_props['icon'] = icon_elem.get('src') - break # Just use the first icon + icon_elem = prog.find('icon') + if icon_elem is not None and icon_elem.get('src'): + custom_props['icon'] = icon_elem.get('src') + # Simpler approach for boolean flags for kw in ['previously-shown', 'premiere', 'new']: if prog.find(kw) is not None: custom_props[kw.replace('-', '_')] = True diff --git a/core/tasks.py b/core/tasks.py index a6bd80cf..fbd9277d 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -36,11 +36,6 @@ LOG_THROTTLE_SECONDS = 300 # 5 minutes # Track if this is the first scan since startup _first_scan_completed = False -@shared_task -def beat_periodic_task(): - fetch_channel_stats() - scan_and_process_files() - def throttled_log(logger_method, message, key=None, *args, **kwargs): """Only log messages with the same key once per throttle period""" if key is None: @@ -52,6 +47,32 @@ def throttled_log(logger_method, message, key=None, *args, **kwargs): logger_method(message, *args, **kwargs) _last_log_times[key] = now +def clear_memory(): + """Force aggressive garbage collection to free memory""" + import gc + # Run full garbage collection + gc.collect(generation=2) + # Find and break any reference cycles + gc.collect(generation=0) + # Clear any cached objects in memory + gc.collect(generation=1) + # Check if psutil is available for more advanced monitoring + try: + import psutil + process = psutil.Process() + if hasattr(process, 'memory_info'): + mem = process.memory_info().rss / (1024 * 1024) + logger.debug(f"Memory usage after cleanup: {mem:.2f} MB") + except (ImportError, Exception): + pass + +@shared_task +def beat_periodic_task(): + fetch_channel_stats() + scan_and_process_files() + # Call memory cleanup after completing tasks + clear_memory() + @shared_task def 
scan_and_process_files(): global _first_scan_completed @@ -270,6 +291,9 @@ def scan_and_process_files(): # Mark that the first scan is complete _first_scan_completed = True + # Force memory cleanup + clear_memory() + def fetch_channel_stats(): redis_client = RedisClient.get_client() diff --git a/dispatcharr/celery.py b/dispatcharr/celery.py index a0ff2168..b0debc76 100644 --- a/dispatcharr/celery.py +++ b/dispatcharr/celery.py @@ -2,6 +2,7 @@ import os from celery import Celery import logging +from celery.signals import task_postrun # Add import for signals # Initialize with defaults before Django settings are loaded DEFAULT_LOG_LEVEL = 'DEBUG' @@ -48,6 +49,24 @@ app.conf.update( worker_task_log_format='%(asctime)s %(levelname)s %(task_name)s: %(message)s', ) +# Add memory cleanup after task completion +@task_postrun.connect # Use the imported signal +def cleanup_task_memory(**kwargs): + """Clean up memory after each task completes""" + import gc + # Force garbage collection + gc.collect() + + # Log memory usage if psutil is installed + try: + import psutil + process = psutil.Process() + if hasattr(process, 'memory_info'): + mem = process.memory_info().rss / (1024 * 1024) + print(f"Memory usage after task: {mem:.2f} MB") + except (ImportError, Exception): + pass + @app.on_after_configure.connect def setup_celery_logging(**kwargs): # Use our directly determined log level diff --git a/dispatcharr/settings.py b/dispatcharr/settings.py index 9eb7dd2b..de8464d5 100644 --- a/dispatcharr/settings.py +++ b/dispatcharr/settings.py @@ -199,6 +199,15 @@ CELERY_BROKER_TRANSPORT_OPTIONS = { CELERY_ACCEPT_CONTENT = ['json'] CELERY_TASK_SERIALIZER = 'json' +# Memory management settings +#CELERY_WORKER_MAX_TASKS_PER_CHILD = 10 # Restart worker after 10 tasks to free memory +#CELERY_WORKER_PREFETCH_MULTIPLIER = 1 # Don't prefetch tasks - process one at a time +#CELERY_TASK_ACKS_LATE = True # Only acknowledge tasks after they're processed +#CELERY_TASK_TIME_LIMIT = 3600 # 1 hour time limit per task +#CELERY_TASK_SOFT_TIME_LIMIT = 3540 # Soft limit 60 seconds before hard limit +#CELERY_WORKER_CANCEL_LONG_RUNNING_TASKS_ON_CONNECTION_LOSS = True # Cancel tasks if connection lost +#CELERY_TASK_IGNORE_RESULT = True # Don't store results unless explicitly needed + CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers.DatabaseScheduler" CELERY_BEAT_SCHEDULE = { 'fetch-channel-statuses': { diff --git a/docker/uwsgi.debug.ini b/docker/uwsgi.debug.ini index 43ecd5ce..ea567e1e 100644 --- a/docker/uwsgi.debug.ini +++ b/docker/uwsgi.debug.ini @@ -8,7 +8,7 @@ exec-before = python /app/scripts/wait_for_redis.py ; Start Redis first attach-daemon = redis-server ; Then start other services -attach-daemon = celery -A dispatcharr worker +attach-daemon = celery -A dispatcharr worker --concurrency=4 attach-daemon = celery -A dispatcharr beat attach-daemon = daphne -b 0.0.0.0 -p 8001 dispatcharr.asgi:application attach-daemon = cd /app/frontend && npm run dev From 8133af5d20d8ec633d9ff5834da3153175cdf120 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sat, 17 May 2025 17:26:18 -0500 Subject: [PATCH 04/25] Remove old parser reference. 
--- apps/epg/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 74a77f0e..04a96ae0 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -1019,8 +1019,6 @@ def parse_programs_for_tvg_id(epg_id): if program_parser: program_parser = None - # Free parser memory - parser = None gc.collect() except etree.XMLSyntaxError as xml_error: From ed665584e997cc849c90c7016a8bd646e699fc9e Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sun, 18 May 2025 17:05:03 -0500 Subject: [PATCH 05/25] The struggle is real --- apps/epg/tasks.py | 268 ++++++++++++++++++++++++++------------------ apps/m3u/tasks.py | 18 +-- apps/proxy/tasks.py | 16 ++- core/tasks.py | 27 +++-- core/utils.py | 48 ++++++-- 5 files changed, 234 insertions(+), 143 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 04a96ae0..bc59771b 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -10,6 +10,7 @@ from datetime import datetime, timedelta, timezone as dt_timezone import gc # Add garbage collection module import json from lxml import etree # Using lxml exclusively +import psutil # Add import for memory tracking from celery import shared_task from django.conf import settings @@ -22,7 +23,7 @@ from asgiref.sync import async_to_sync from channels.layers import get_channel_layer from .models import EPGSource, EPGData, ProgramData -from core.utils import acquire_task_lock, release_task_lock +from core.utils import acquire_task_lock, release_task_lock, send_websocket_update logger = logging.getLogger(__name__) @@ -40,18 +41,18 @@ def send_epg_update(source_id, action, progress, **kwargs): # Add the additional key-value pairs from kwargs data.update(kwargs) - # Now, send the updated data dictionary - channel_layer = get_channel_layer() - try: - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - 'data': data - } - ) - except Exception as e: - logger.warning(f"Failed to send WebSocket update: {e}") + # Use the standardized update function with garbage collection for program parsing + # This is a high-frequency operation that needs more aggressive memory management + collect_garbage = action == "parsing_programs" and progress % 10 == 0 + send_websocket_update('updates', 'update', data, collect_garbage=collect_garbage) + + # Explicitly clear references + data = None + + # For high-frequency parsing, occasionally force additional garbage collection + # to prevent memory buildup + if action == "parsing_programs" and progress % 50 == 0: + gc.collect() def delete_epg_refresh_task_by_id(epg_id): @@ -529,18 +530,16 @@ def parse_channels_only(source): # Update status to error source.status = 'error' source.last_message = f"No URL provided, cannot fetch EPG data" - source.save(update_fields=['status', 'last_message']) - send_epg_update(source.id, "parsing_channels", 100, status="error", error="No URL provided") - return False + source.save(update_fields=['updated_at']) - file_path = new_path - logger.info(f"Parsing channels from EPG file: {file_path}") - - # Add memory tracking at start - import psutil - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"Initial memory usage: {initial_memory:.2f} MB") + # Initialize process variable for memory tracking + try: + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"Initial memory usage: {initial_memory:.2f} MB") + except (ImportError, NameError): + process = None + logger.warning("psutil not available 
for memory tracking") # Replace full dictionary load with more efficient lookup set existing_tvg_ids = set() @@ -574,7 +573,8 @@ def parse_channels_only(source): progress = 0 # Initialize progress variable here # Track memory at key points - logger.info(f"Memory before opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory before opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") try: # Create a parser with the desired options @@ -584,7 +584,8 @@ def parse_channels_only(source): # Open the file first logger.info(f"Opening file for initial channel count: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - logger.info(f"Memory after opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Count channels try: @@ -597,7 +598,8 @@ def parse_channels_only(source): # Close the file to reset position logger.info(f"Closing initial file handle") source_file.close() - logger.info(f"Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update progress after counting send_epg_update(source.id, "parsing_channels", 25, total_channels=total_channels) @@ -605,11 +607,13 @@ def parse_channels_only(source): # Reset file position for actual processing logger.info(f"Re-opening file for channel parsing: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') - logger.info(f"Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") logger.info(f"Creating iterparse context") channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') - logger.info(f"Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") channel_count = 0 for _, elem in channel_parser: @@ -658,11 +662,13 @@ def parse_channels_only(source): if len(epgs_to_create) >= batch_size: logger.info(f"Bulk creating {len(epgs_to_create)} EPG entries") EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) - logger.info(f"Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") del epgs_to_create # Explicit deletion epgs_to_create = [] gc.collect() - logger.info(f"Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") if len(epgs_to_update) >= batch_size: EPGData.objects.bulk_update(epgs_to_update, ["name"]) @@ -675,7 +681,8 @@ def parse_channels_only(source): logger.info(f"Clearing existing_epgs cache at {processed_channels} channels") existing_epgs.clear() gc.collect() - logger.info(f"Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Send progress updates if processed_channels % 100 == 0 or processed_channels == total_channels: @@ -689,9 +696,29 @@ def 
parse_channels_only(source): ) logger.debug(f"Processed channel: {tvg_id} - {display_name}") # Clear memory - elem.clear() - while elem.getprevious() is not None: - del elem.getparent()[0] + try: + # First clear the element's content + elem.clear() + + # Get the parent before we might lose reference to it + parent = elem.getparent() + if parent is not None: + # Clean up preceding siblings + while elem.getprevious() is not None: + del parent[0] + + # Try to fully detach this element from parent + try: + parent.remove(elem) + del elem + del parent + except (ValueError, KeyError, TypeError): + # Element might already be removed or detached + pass + + except Exception as e: + # Just log the error and continue - don't let cleanup errors stop processing + logger.debug(f"Non-critical error during XML element cleanup: {e}") # Check if we should break early to avoid excessive sleep if processed_channels >= total_channels and total_channels > 0: @@ -700,7 +727,8 @@ def parse_channels_only(source): # Explicit cleanup before sleeping logger.info(f"Completed channel parsing loop, processed {processed_channels} channels") - logger.info(f"Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Explicit cleanup of the parser del channel_parser @@ -713,7 +741,8 @@ def parse_channels_only(source): # Force garbage collection gc.collect() - logger.info(f"Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Remove long sleep that might be causing issues # time.sleep(200) # This seems excessive and may be causing issues @@ -737,9 +766,11 @@ def parse_channels_only(source): logger.info(f"Updated final batch of {len(epgs_to_update)} EPG entries") # Final garbage collection and memory tracking - logger.info(f"Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") gc.collect() - logger.info(f"Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + if process: + logger.info(f"Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update source status with channel count source.status = 'success' @@ -755,14 +786,7 @@ def parse_channels_only(source): channels_count=processed_channels ) - channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - "data": {"success": True, "type": "epg_channels"} - } - ) + send_websocket_update('updates', 'update', {"success": True, "type": "epg_channels"}) logger.info(f"Finished parsing channel info. 
Found {processed_channels} channels.") # Remove excessive sleep @@ -790,15 +814,20 @@ def parse_channels_only(source): logger.info("In finally block, ensuring cleanup") existing_tvg_ids = None existing_epgs = None - gc.collect() - # Check final memory usage try: - import psutil process = psutil.Process() final_memory = process.memory_info().rss / 1024 / 1024 logger.info(f"Final memory usage: {final_memory:.2f} MB") except: pass + # Explicitly clear the process object to prevent potential memory leaks + if 'process' in locals() and process is not None: + process = None + + # Check final memory usage after clearing process + gc.collect() + + @shared_task @@ -806,17 +835,16 @@ def parse_programs_for_tvg_id(epg_id): if not acquire_task_lock('parse_epg_programs', epg_id): logger.info(f"Program parse for {epg_id} already in progress, skipping duplicate task") return "Task already running" - source_file = None program_parser = None programs_to_create = None epg = None epg_source = None + programs_processed = 0 try: # Add memory tracking try: - import psutil process = psutil.Process() initial_memory = process.memory_info().rss / 1024 / 1024 logger.info(f"[parse_programs_for_tvg_id] Initial memory usage: {initial_memory:.2f} MB") @@ -834,39 +862,16 @@ def parse_programs_for_tvg_id(epg_id): logger.info(f"Refreshing program data for tvg_id: {epg.tvg_id}") # First, remove all existing programs - use chunked delete to avoid memory issues + # Delete old programs chunk_size = 5000 - programs_to_delete = ProgramData.objects.filter(epg=epg) - total_programs = programs_to_delete.count() - - if total_programs > 0: - logger.info(f"Deleting {total_programs} existing programs for {epg.tvg_id}") - - # More memory-efficient approach using cursor-based pagination - last_id = 0 - while True: - # Get batch of IDs greater than the last ID processed - id_batch = list(programs_to_delete.filter(id__gt=last_id).order_by('id').values_list('id', flat=True)[:chunk_size]) - if not id_batch: - break - - # Store the last ID before deleting the batch variable - if id_batch: - max_id = id_batch[-1] - else: - max_id = 0 - - # Delete this batch - ProgramData.objects.filter(id__in=id_batch).delete() - # Release memory immediately - del id_batch - gc.collect() - - # Update last_id for next iteration using our stored value - last_id = max_id - - # Explicitly delete query objects - del programs_to_delete - del last_id + last_id = 0 + while True: + ids = list(ProgramData.objects.filter(epg=epg, id__gt=last_id).order_by('id').values_list('id', flat=True)[:chunk_size]) + if not ids: + break + ProgramData.objects.filter(id__in=ids).delete() + last_id = ids[-1] + del ids gc.collect() file_path = epg_source.file_path @@ -929,7 +934,7 @@ def parse_programs_for_tvg_id(epg_id): # Memory usage tracking if process: mem_before = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_programs_for_tvg_id] Memory before parsing: {mem_before:.2f} MB") + logger.info(f"[parse_programs_for_tvg_id] Memory before parsing {epg.tvg_id} - {mem_before:.2f} MB") programs_to_create = [] batch_size = 1000 # Process in batches to limit memory usage @@ -968,9 +973,9 @@ def parse_programs_for_tvg_id(epg_id): # Extract custom properties custom_props = extract_custom_properties(elem) - custom_properties_json = None + if custom_props: - logger.debug(f"Number of custom properties: {len(custom_props)}") + logger.trace(f"Number of custom properties: {len(custom_props)}") try: custom_properties_json = json.dumps(custom_props) except Exception as e: @@ -986,30 
+991,58 @@ def parse_programs_for_tvg_id(epg_id): tvg_id=epg.tvg_id, custom_properties=custom_properties_json )) - programs_processed += 1 - custom_props = None - custom_properties_json = None + del custom_props + del custom_properties_json + del start_time + del end_time + del title + del desc + del sub_title + elem.clear() + parent = elem.getparent() + if parent is not None: + while elem.getprevious() is not None: + del parent[0] + parent.remove(elem) + del elem + del parent + #gc.collect() # Batch processing if len(programs_to_create) >= batch_size: ProgramData.objects.bulk_create(programs_to_create) logger.debug(f"Saved batch of {len(programs_to_create)} programs for {epg.tvg_id}") - del programs_to_create # Explicit deletion - programs_to_create = [] - - # Force more aggressive garbage collection - custom_props = None - custom_properties_json = None + del programs_to_create + del custom_props + del custom_properties_json gc.collect() + #continue except Exception as e: logger.error(f"Error processing program for {epg.tvg_id}: {e}", exc_info=True) - # Important: Clear the element to avoid memory leaks (lxml specific method) - elem.clear() - # Also eliminate ancestors to prevent memory leaks - while elem.getprevious() is not None: - del elem.getparent()[0] + # Important: Clear the element to avoid memory leaks using a more robust approach + try: + # First clear the element's content + elem.clear() + # Get the parent before we might lose reference to it + parent = elem.getparent() + if parent is not None: + # Clean up preceding siblings + while elem.getprevious() is not None: + del parent[0] + # Try to fully detach this element from parent + try: + parent.remove(elem) + del elem + del parent + except (ValueError, KeyError, TypeError): + # Element might already be removed or detached + pass + + except Exception as e: + # Just log the error and continue - don't let cleanup errors stop processing + logger.debug(f"Non-critical error during XML element cleanup: {e}") # Make sure to close the file and release parser resources if source_file: @@ -1032,18 +1065,21 @@ def parse_programs_for_tvg_id(epg_id): if source_file: source_file.close() source_file = None + # Memory tracking after processing + if process: + mem_after = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Memory after parsing 1 {epg.tvg_id} - {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") # Process any remaining items if programs_to_create: ProgramData.objects.bulk_create(programs_to_create) logger.debug(f"Saved final batch of {len(programs_to_create)} programs for {epg.tvg_id}") - del programs_to_create - programs_to_create = [] + programs_to_create = None + custom_props = None + custom_properties_json = None + #del programs_to_create + #programs_to_create = [] - # Memory tracking after processing - if process: - mem_after = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_programs_for_tvg_id] Memory after parsing {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") # Final garbage collection gc.collect() @@ -1066,25 +1102,31 @@ def parse_programs_for_tvg_id(epg_id): source_file.close() except: pass - + # Memory tracking after processing + if process: + mem_after = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Memory after parsing 2 {epg.tvg_id} - {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") source_file = None 
program_parser = None programs_to_create = None epg = None epg_source = None - + # Explicitly clear the process object to prevent potential memory leaks + if 'process' in locals() and process is not None: + process = None # Force garbage collection before releasing lock gc.collect() + release_task_lock('parse_epg_programs', epg_id) + def parse_programs_for_source(epg_source, tvg_id=None): # Send initial programs parsing notification send_epg_update(epg_source.id, "parsing_programs", 0) - + #time.sleep(100) # Add memory tracking try: - import psutil process = psutil.Process() initial_memory = process.memory_info().rss / 1024 / 1024 logger.info(f"[parse_programs_for_source] Initial memory usage: {initial_memory:.2f} MB") @@ -1219,6 +1261,10 @@ def parse_programs_for_source(epg_source, tvg_id=None): processed = None gc.collect() + # Explicitly clear the process object to prevent potential memory leaks + if 'process' in locals() and process is not None: + process = None + def fetch_schedules_direct(source): logger.info(f"Fetching Schedules Direct data from source: {source.name}") diff --git a/apps/m3u/tasks.py b/apps/m3u/tasks.py index 71ee5f49..513c550d 100644 --- a/apps/m3u/tasks.py +++ b/apps/m3u/tasks.py @@ -1088,6 +1088,8 @@ def refresh_single_m3u_account(account_id): return f"Dispatched jobs complete." +from core.utils import send_websocket_update + def send_m3u_update(account_id, action, progress, **kwargs): # Start with the base data dictionary data = { @@ -1111,12 +1113,10 @@ def send_m3u_update(account_id, action, progress, **kwargs): # Add the additional key-value pairs from kwargs data.update(kwargs) - # Now, send the updated data dictionary - channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - 'data': data - } - ) + # Use the standardized function with memory management + # Enable garbage collection for certain operations + collect_garbage = action == "parsing" and progress % 25 == 0 + send_websocket_update('updates', 'update', data, collect_garbage=collect_garbage) + + # Explicitly clear data reference to help garbage collection + data = None diff --git a/apps/proxy/tasks.py b/apps/proxy/tasks.py index a4aaf8e5..00e3e039 100644 --- a/apps/proxy/tasks.py +++ b/apps/proxy/tasks.py @@ -6,8 +6,10 @@ import redis import json import logging import re +import gc # Add import for garbage collection from core.utils import RedisClient from apps.proxy.ts_proxy.channel_status import ChannelStatus +from core.utils import send_websocket_update logger = logging.getLogger(__name__) @@ -43,11 +45,17 @@ def fetch_channel_stats(): return # return JsonResponse({'error': str(e)}, status=500) - channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( + send_websocket_update( "updates", + "update", { - "type": "update", - "data": {"success": True, "type": "channel_stats", "stats": json.dumps({'channels': all_channels, 'count': len(all_channels)})} + "success": True, + "type": "channel_stats", + "stats": json.dumps({'channels': all_channels, 'count': len(all_channels)}) }, + collect_garbage=True ) + + # Explicitly clean up large data structures + all_channels = None + gc.collect() diff --git a/core/tasks.py b/core/tasks.py index fbd9277d..5e60a4e4 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -8,7 +8,7 @@ import logging import re import time import os -from core.utils import RedisClient +from core.utils import RedisClient, send_websocket_update from apps.proxy.ts_proxy.channel_status import ChannelStatus from 
apps.m3u.models import M3UAccount from apps.epg.models import EPGSource @@ -317,19 +317,24 @@ def fetch_channel_stats(): if cursor == 0: break + send_websocket_update( + "updates", + "update", + { + "success": True, + "type": "channel_stats", + "stats": json.dumps({'channels': all_channels, 'count': len(all_channels)}) + }, + collect_garbage=True + ) + + # Explicitly clean up large data structures + all_channels = None + gc.collect() + except Exception as e: logger.error(f"Error in channel_status: {e}", exc_info=True) return - # return JsonResponse({'error': str(e)}, status=500) - - channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( - "updates", - { - "type": "update", - "data": {"success": True, "type": "channel_stats", "stats": json.dumps({'channels': all_channels, 'count': len(all_channels)})} - }, - ) @shared_task def rehash_streams(keys): diff --git a/core/utils.py b/core/utils.py index 01463ad9..abe1c1f2 100644 --- a/core/utils.py +++ b/core/utils.py @@ -173,15 +173,47 @@ def release_task_lock(task_name, id): # Remove the lock redis_client.delete(lock_id) -def send_websocket_event(event, success, data): +def send_websocket_update(group_name, event_type, data, collect_garbage=False): + """ + Standardized function to send WebSocket updates with proper memory management. + + Args: + group_name: The WebSocket group to send to (e.g. 'updates') + event_type: The type of message (e.g. 'update') + data: The data to send + collect_garbage: Whether to force garbage collection after sending + """ channel_layer = get_channel_layer() - async_to_sync(channel_layer.group_send)( - 'updates', - { - 'type': 'update', - "data": {"success": True, "type": "epg_channels"} - } - ) + try: + async_to_sync(channel_layer.group_send)( + group_name, + { + 'type': event_type, + 'data': data + } + ) + except Exception as e: + logger.warning(f"Failed to send WebSocket update: {e}") + finally: + # Explicitly release references to help garbage collection + channel_layer = None + + # Force garbage collection if requested + if collect_garbage: + gc.collect() + +def send_websocket_event(event, success, data): + """Acquire a lock to prevent concurrent task execution.""" + data_payload = {"success": success, "type": event} + if data: + # Make a copy to avoid modifying the original + data_payload.update(data) + + # Use the standardized function + send_websocket_update('updates', 'update', data_payload) + + # Help garbage collection by clearing references + data_payload = None # Add memory monitoring utilities def get_memory_usage(): From e641cef6f1bc04944313939dbdafddf7695e2cf6 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sun, 18 May 2025 18:28:01 -0500 Subject: [PATCH 06/25] Fixed errors --- apps/epg/tasks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index bc59771b..c66fdff7 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -841,12 +841,14 @@ def parse_programs_for_tvg_id(epg_id): epg = None epg_source = None programs_processed = 0 + mem_before = 0 # Initialize with default value to avoid UnboundLocalError try: # Add memory tracking try: process = psutil.Process() initial_memory = process.memory_info().rss / 1024 / 1024 + mem_before = initial_memory # Set mem_before here logger.info(f"[parse_programs_for_tvg_id] Initial memory usage: {initial_memory:.2f} MB") except ImportError: process = None @@ -1042,7 +1044,7 @@ def parse_programs_for_tvg_id(epg_id): except Exception as e: # Just log the error and continue - don't 
let cleanup errors stop processing - logger.debug(f"Non-critical error during XML element cleanup: {e}") + logger.trace(f"Non-critical error during XML element cleanup: {e}") # Make sure to close the file and release parser resources if source_file: From b84e3f77f3766a0367d84bdf958988dc5c4b7fa2 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sun, 18 May 2025 18:56:30 -0500 Subject: [PATCH 07/25] Fixed custom props not being loaded. --- apps/epg/tasks.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index c66fdff7..17b9faf8 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -975,6 +975,7 @@ def parse_programs_for_tvg_id(epg_id): # Extract custom properties custom_props = extract_custom_properties(elem) + custom_properties_json = None if custom_props: logger.trace(f"Number of custom properties: {len(custom_props)}") @@ -1022,6 +1023,19 @@ def parse_programs_for_tvg_id(epg_id): except Exception as e: logger.error(f"Error processing program for {epg.tvg_id}: {e}", exc_info=True) + else: + # Immediately clean up non-matching elements to reduce memory pressure + elem.clear() + parent = elem.getparent() + if parent is not None: + while elem.getprevious() is not None: + del parent[0] + try: + parent.remove(elem) + except (ValueError, KeyError, TypeError): + pass + del elem + continue # Important: Clear the element to avoid memory leaks using a more robust approach try: From f821743163d996682346baf69c9fbc83bebc4ddc Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sun, 18 May 2025 19:46:52 -0500 Subject: [PATCH 08/25] Created a utility to clean up django memory. --- apps/epg/tasks.py | 11 ++++++++--- core/utils.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 17b9faf8..d808a4c6 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -23,7 +23,7 @@ from asgiref.sync import async_to_sync from channels.layers import get_channel_layer from .models import EPGSource, EPGData, ProgramData -from core.utils import acquire_task_lock, release_task_lock, send_websocket_update +from core.utils import acquire_task_lock, release_task_lock, send_websocket_update, cleanup_memory logger = logging.getLogger(__name__) @@ -826,8 +826,8 @@ def parse_channels_only(source): # Check final memory usage after clearing process gc.collect() - - + # Add comprehensive cleanup at end of channel parsing + cleanup_memory(log_usage=True, force_collection=True) @shared_task @@ -1133,6 +1133,8 @@ def parse_programs_for_tvg_id(epg_id): # Force garbage collection before releasing lock gc.collect() + # Add comprehensive cleanup before releasing lock + cleanup_memory(log_usage=True, force_collection=True) release_task_lock('parse_epg_programs', epg_id) @@ -1281,6 +1283,9 @@ def parse_programs_for_source(epg_source, tvg_id=None): if 'process' in locals() and process is not None: process = None + # Add comprehensive memory cleanup at the end + cleanup_memory(log_usage=True, force_collection=True) + def fetch_schedules_direct(source): logger.info(f"Fetching Schedules Direct data from source: {source.name}") diff --git a/core/utils.py b/core/utils.py index abe1c1f2..7143a349 100644 --- a/core/utils.py +++ b/core/utils.py @@ -245,3 +245,40 @@ def monitor_memory_usage(func): return result return wrapper + +def cleanup_memory(log_usage=True, force_collection=True): + """ + Comprehensive memory cleanup function to reduce memory footprint + + Args: + log_usage: Whether to log 
memory usage before and after cleanup + force_collection: Whether to force garbage collection + """ + if log_usage: + try: + import psutil + process = psutil.Process() + before_mem = process.memory_info().rss / (1024 * 1024) + logger.debug(f"Memory before cleanup: {before_mem:.2f} MB") + except (ImportError, Exception) as e: + logger.debug(f"Error getting memory usage: {e}") + + # Clear any object caches from Django ORM + from django.db import connection, reset_queries + reset_queries() + + # Force garbage collection + if force_collection: + # Run full collection + gc.collect(generation=2) + # Clear cyclic references + gc.collect(generation=0) + + if log_usage: + try: + import psutil + process = psutil.Process() + after_mem = process.memory_info().rss / (1024 * 1024) + logger.debug(f"Memory after cleanup: {after_mem:.2f} MB (change: {after_mem-before_mem:.2f} MB)") + except (ImportError, Exception): + pass From 7c809931d7d63ff339df98e183cacfacc94944d6 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Sun, 18 May 2025 20:57:37 -0500 Subject: [PATCH 09/25] Rewored celery memory cleanup logic. --- apps/channels/tasks.py | 6 ++--- apps/m3u/tasks.py | 6 +++-- core/tasks.py | 20 ++------------- dispatcharr/celery.py | 56 +++++++++++++++++++++++++++++++++--------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/apps/channels/tasks.py b/apps/channels/tasks.py index b4de5e07..d59169e5 100755 --- a/apps/channels/tasks.py +++ b/apps/channels/tasks.py @@ -216,9 +216,9 @@ def match_epg_channels(): finally: # Final cleanup gc.collect() - # Force an even more aggressive cleanup - import gc - gc.collect(generation=2) + # Use our standardized cleanup function for more thorough memory management + from core.utils import cleanup_memory + cleanup_memory(log_usage=True, force_collection=True) @shared_task diff --git a/apps/m3u/tasks.py b/apps/m3u/tasks.py index 513c550d..4a1f2645 100644 --- a/apps/m3u/tasks.py +++ b/apps/m3u/tasks.py @@ -496,7 +496,8 @@ def process_m3u_batch(account_id, batch, groups, hash_keys): # Aggressive garbage collection del streams_to_create, streams_to_update, stream_hashes, existing_streams - gc.collect() + from core.utils import cleanup_memory + cleanup_memory(log_usage=True, force_collection=True) return retval @@ -1080,7 +1081,8 @@ def refresh_single_m3u_account(account_id): # Aggressive garbage collection del existing_groups, extinf_data, groups, batches - gc.collect() + from core.utils import cleanup_memory + cleanup_memory(log_usage=True, force_collection=True) # Clean up cache file since we've fully processed it if os.path.exists(cache_path): diff --git a/core/tasks.py b/core/tasks.py index 5e60a4e4..9af367f9 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -49,29 +49,13 @@ def throttled_log(logger_method, message, key=None, *args, **kwargs): def clear_memory(): """Force aggressive garbage collection to free memory""" - import gc - # Run full garbage collection - gc.collect(generation=2) - # Find and break any reference cycles - gc.collect(generation=0) - # Clear any cached objects in memory - gc.collect(generation=1) - # Check if psutil is available for more advanced monitoring - try: - import psutil - process = psutil.Process() - if hasattr(process, 'memory_info'): - mem = process.memory_info().rss / (1024 * 1024) - logger.debug(f"Memory usage after cleanup: {mem:.2f} MB") - except (ImportError, Exception): - pass + from core.utils import cleanup_memory + cleanup_memory(log_usage=True, force_collection=True) @shared_task def beat_periodic_task(): 
fetch_channel_stats() scan_and_process_files() - # Call memory cleanup after completing tasks - clear_memory() @shared_task def scan_and_process_files(): diff --git a/dispatcharr/celery.py b/dispatcharr/celery.py index b0debc76..855acacd 100644 --- a/dispatcharr/celery.py +++ b/dispatcharr/celery.py @@ -53,19 +53,51 @@ app.conf.update( @task_postrun.connect # Use the imported signal def cleanup_task_memory(**kwargs): """Clean up memory after each task completes""" - import gc - # Force garbage collection - gc.collect() + # Get task name from kwargs + task_name = kwargs.get('task').name if kwargs.get('task') else '' - # Log memory usage if psutil is installed - try: - import psutil - process = psutil.Process() - if hasattr(process, 'memory_info'): - mem = process.memory_info().rss / (1024 * 1024) - print(f"Memory usage after task: {mem:.2f} MB") - except (ImportError, Exception): - pass + # Only run cleanup for memory-intensive tasks + memory_intensive_tasks = [ + 'apps.m3u.tasks.refresh_single_m3u_account', + 'apps.m3u.tasks.refresh_m3u_accounts', + 'apps.m3u.tasks.process_m3u_batch', + 'apps.m3u.tasks.process_xc_category', + 'apps.epg.tasks.refresh_epg_data', + 'apps.epg.tasks.refresh_all_epg_data', + 'apps.epg.tasks.parse_programs_for_source', + 'apps.epg.tasks.parse_programs_for_tvg_id', + 'apps.channels.tasks.match_epg_channels', + 'core.tasks.rehash_streams' + ] + + # Check if this is a memory-intensive task + if task_name in memory_intensive_tasks: + # Import cleanup_memory function + from core.utils import cleanup_memory + + # Use the comprehensive cleanup function + cleanup_memory(log_usage=True, force_collection=True) + + # Log memory usage if psutil is installed + try: + import psutil + process = psutil.Process() + if hasattr(process, 'memory_info'): + mem = process.memory_info().rss / (1024 * 1024) + print(f"Memory usage after {task_name}: {mem:.2f} MB") + except (ImportError, Exception): + pass + else: + # For non-intensive tasks, just log but don't force cleanup + try: + import psutil + process = psutil.Process() + if hasattr(process, 'memory_info'): + mem = process.memory_info().rss / (1024 * 1024) + if mem > 500: # Only log if using more than 500MB + print(f"High memory usage detected in {task_name}: {mem:.2f} MB") + except (ImportError, Exception): + pass @app.on_after_configure.connect def setup_celery_logging(**kwargs): From 6087ecadf0cdd6a2d43b5bb130cfcd9e732458bd Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Mon, 19 May 2025 09:42:21 -0500 Subject: [PATCH 10/25] Cleaning up added gc's --- apps/epg/tasks.py | 107 ++++++++++++++++------------------------------ core/utils.py | 2 + 2 files changed, 40 insertions(+), 69 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index d808a4c6..eb361429 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -536,7 +536,7 @@ def parse_channels_only(source): try: process = psutil.Process() initial_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"Initial memory usage: {initial_memory:.2f} MB") + logger.info(f"[parse_channels_only] Initial memory usage: {initial_memory:.2f} MB") except (ImportError, NameError): process = None logger.warning("psutil not available for memory tracking") @@ -574,7 +574,7 @@ def parse_channels_only(source): # Track memory at key points if process: - logger.info(f"Memory before opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory before opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") try: # Create a 
parser with the desired options @@ -585,7 +585,7 @@ def parse_channels_only(source): logger.info(f"Opening file for initial channel count: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') if process: - logger.info(f"Memory after opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Count channels try: @@ -599,7 +599,7 @@ def parse_channels_only(source): logger.info(f"Closing initial file handle") source_file.close() if process: - logger.info(f"Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update progress after counting send_epg_update(source.id, "parsing_channels", 25, total_channels=total_channels) @@ -608,12 +608,12 @@ def parse_channels_only(source): logger.info(f"Re-opening file for channel parsing: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') if process: - logger.info(f"Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") logger.info(f"Creating iterparse context") channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') if process: - logger.info(f"Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") channel_count = 0 for _, elem in channel_parser: @@ -663,12 +663,12 @@ def parse_channels_only(source): logger.info(f"Bulk creating {len(epgs_to_create)} EPG entries") EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) if process: - logger.info(f"Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") del epgs_to_create # Explicit deletion epgs_to_create = [] gc.collect() if process: - logger.info(f"Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") if len(epgs_to_update) >= batch_size: EPGData.objects.bulk_update(epgs_to_update, ["name"]) @@ -682,7 +682,7 @@ def parse_channels_only(source): existing_epgs.clear() gc.collect() if process: - logger.info(f"Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Send progress updates if processed_channels % 100 == 0 or processed_channels == total_channels: @@ -715,6 +715,7 @@ def parse_channels_only(source): except (ValueError, KeyError, TypeError): # Element might already be removed or detached pass + cleanup_memory(log_usage=True, force_collection=True) except Exception as e: # Just log the error and continue - don't let cleanup errors stop processing @@ -728,7 +729,7 @@ def parse_channels_only(source): # Explicit cleanup before sleeping logger.info(f"Completed channel parsing loop, processed {processed_channels} channels") if process: - logger.info(f"Memory before cleanup: {process.memory_info().rss / 1024 / 
1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Explicit cleanup of the parser del channel_parser @@ -742,7 +743,7 @@ def parse_channels_only(source): # Force garbage collection gc.collect() if process: - logger.info(f"Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Remove long sleep that might be causing issues # time.sleep(200) # This seems excessive and may be causing issues @@ -767,10 +768,10 @@ def parse_channels_only(source): # Final garbage collection and memory tracking if process: - logger.info(f"Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") gc.collect() if process: - logger.info(f"Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.info(f"[parse_channels_only] Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update source status with channel count source.status = 'success' @@ -814,20 +815,19 @@ def parse_channels_only(source): logger.info("In finally block, ensuring cleanup") existing_tvg_ids = None existing_epgs = None - try: - process = psutil.Process() - final_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"Final memory usage: {final_memory:.2f} MB") - except: - pass - # Explicitly clear the process object to prevent potential memory leaks - if 'process' in locals() and process is not None: - process = None # Check final memory usage after clearing process gc.collect() # Add comprehensive cleanup at end of channel parsing cleanup_memory(log_usage=True, force_collection=True) + try: + process = psutil.Process() + final_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_channels_only] Final memory usage: {final_memory:.2f} MB") + process = None + except: + pass + @shared_task @@ -943,9 +943,6 @@ def parse_programs_for_tvg_id(epg_id): programs_processed = 0 try: - # Create a parser with the desired options - #parser = etree.XMLParser(huge_tree=True, remove_blank_text=True) - # Open the file properly source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') @@ -1093,48 +1090,38 @@ def parse_programs_for_tvg_id(epg_id): programs_to_create = None custom_props = None custom_properties_json = None - #del programs_to_create - #programs_to_create = [] - # Final garbage collection - gc.collect() - - # One additional garbage collection specifically for lxml elements - # which can sometimes be retained due to reference cycles - gc.collect() - + logger.info(f"Completed program parsing for tvg_id={epg.tvg_id}.") + finally: # Reset internal caches and pools that lxml might be keeping try: etree.clear_error_log() except: pass - - logger.info(f"Completed program parsing for tvg_id={epg.tvg_id}.") - finally: # Explicit cleanup of all potentially large objects if source_file: try: source_file.close() except: pass - # Memory tracking after processing - if process: - mem_after = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_programs_for_tvg_id] Memory after parsing 2 {epg.tvg_id} - {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") source_file = None program_parser = None programs_to_create = None - epg = None + epg_source = None - # Explicitly clear the 
process object to prevent potential memory leaks - if 'process' in locals() and process is not None: - process = None # Force garbage collection before releasing lock gc.collect() # Add comprehensive cleanup before releasing lock cleanup_memory(log_usage=True, force_collection=True) + # Memory tracking after processing + if process: + mem_after = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Final memory usage {epg.tvg_id} - {programs_processed} programs: {mem_after:.2f} MB (change: {mem_after-mem_before:.2f} MB)") + process = None + epg = None + programs_processed = None release_task_lock('parse_epg_programs', epg_id) @@ -1172,11 +1159,6 @@ def parse_programs_for_source(epg_source, tvg_id=None): channel_count = 0 updated_count = 0 processed = 0 - - # Memory check before batch processing - if process: - logger.info(f"[parse_programs_for_source] Memory before batch processing: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # Process in batches using cursor-based approach to limit memory usage last_id = 0 while True: @@ -1207,18 +1189,10 @@ def parse_programs_for_source(epg_source, tvg_id=None): logger.error(f"Error parsing programs for tvg_id={epg.tvg_id}: {e}", exc_info=True) failed_entries.append(f"{epg.tvg_id}: {str(e)}") - # Memory check after processing batch - if process: - logger.info(f"[parse_programs_for_source] Memory after processing batch: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # Force garbage collection after each batch batch_entries = None # Remove reference to help garbage collection gc.collect() - # Memory check after garbage collection - if process: - logger.info(f"[parse_programs_for_source] Memory after gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # If there were failures, include them in the message but continue if failed_entries: epg_source.status = EPGSource.STATUS_SUCCESS # Still mark as success if some processed @@ -1265,11 +1239,7 @@ def parse_programs_for_source(epg_source, tvg_id=None): return False finally: # Final memory cleanup and tracking - if process: - # Force garbage collection before measuring - gc.collect() - final_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_programs_for_source] Final memory usage: {final_memory:.2f} MB") + # Explicitly release any remaining large data structures failed_entries = None @@ -1279,14 +1249,13 @@ def parse_programs_for_source(epg_source, tvg_id=None): processed = None gc.collect() - # Explicitly clear the process object to prevent potential memory leaks - if 'process' in locals() and process is not None: - process = None - # Add comprehensive memory cleanup at the end cleanup_memory(log_usage=True, force_collection=True) - - + if process: + final_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_source] Final memory usage: {final_memory:.2f} MB difference: {final_memory - initial_memory:.2f} MB") + # Explicitly clear the process object to prevent potential memory leaks + process = None def fetch_schedules_direct(source): logger.info(f"Fetching Schedules Direct data from source: {source.name}") try: diff --git a/core/utils.py b/core/utils.py index 7143a349..1dabf3a6 100644 --- a/core/utils.py +++ b/core/utils.py @@ -254,6 +254,7 @@ def cleanup_memory(log_usage=True, force_collection=True): log_usage: Whether to log memory usage before and after cleanup force_collection: Whether to force garbage collection """ + logger.debug("Starting memory cleanup django memory cleanup") if log_usage: try: 
import psutil @@ -282,3 +283,4 @@ def cleanup_memory(log_usage=True, force_collection=True): logger.debug(f"Memory after cleanup: {after_mem:.2f} MB (change: {after_mem-before_mem:.2f} MB)") except (ImportError, Exception): pass + logger.debug("Memory cleanup complete for django") From eb223e1df2ccff63c96bbc00af6292924169d7e8 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Mon, 19 May 2025 09:53:52 -0500 Subject: [PATCH 11/25] Enable logging for core utils. --- dispatcharr/settings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dispatcharr/settings.py b/dispatcharr/settings.py index de8464d5..4e1e0d55 100644 --- a/dispatcharr/settings.py +++ b/dispatcharr/settings.py @@ -321,6 +321,11 @@ LOGGING = { 'level': LOG_LEVEL, # Use environment-configured level 'propagate': False, # Don't propagate to root logger to avoid duplicate logs }, + 'core.utils': { + 'handlers': ['console'], + 'level': LOG_LEVEL, + 'propagate': False, + }, 'apps.proxy': { 'handlers': ['console'], 'level': LOG_LEVEL, # Use environment-configured level From 06b1dec2b60ffeb2d54b5a45a86091376769fa4e Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Mon, 19 May 2025 10:02:42 -0500 Subject: [PATCH 12/25] Better logic for cleanup task. Skip gathering memory if we aren't going to log it anyway. --- core/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/utils.py b/core/utils.py index 1dabf3a6..d26b0dd3 100644 --- a/core/utils.py +++ b/core/utils.py @@ -254,7 +254,10 @@ def cleanup_memory(log_usage=True, force_collection=True): log_usage: Whether to log memory usage before and after cleanup force_collection: Whether to force garbage collection """ - logger.debug("Starting memory cleanup django memory cleanup") + logger.trace("Starting memory cleanup django memory cleanup") + # Skip logging if log level is not set to debug (no reason to run memory usage if we don't log it) + if not logger.isEnabledFor(logging.DEBUG): + log_usage = False if log_usage: try: import psutil @@ -283,4 +286,4 @@ def cleanup_memory(log_usage=True, force_collection=True): logger.debug(f"Memory after cleanup: {after_mem:.2f} MB (change: {after_mem-before_mem:.2f} MB)") except (ImportError, Exception): pass - logger.debug("Memory cleanup complete for django") + logger.trace("Memory cleanup complete for django") From 8bf093d79b52d7bdac9702266ebaecb409740eba Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 13:53:02 -0500 Subject: [PATCH 13/25] Increased logging and cleanup --- apps/epg/tasks.py | 72 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index eb361429..dfb290c0 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -616,7 +616,9 @@ def parse_channels_only(source): logger.info(f"[parse_channels_only] Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") channel_count = 0 + total_elements_processed = 0 # Track total elements processed, not just channels for _, elem in channel_parser: + total_elements_processed += 1 channel_count += 1 tvg_id = elem.get('id', '').strip() if tvg_id: @@ -660,7 +662,7 @@ def parse_channels_only(source): # Batch processing if len(epgs_to_create) >= batch_size: - logger.info(f"Bulk creating {len(epgs_to_create)} EPG entries") + logger.info(f"[parse_channels_only] Bulk creating {len(epgs_to_create)} EPG entries") EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) if process: logger.info(f"[parse_channels_only] 
Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") @@ -678,7 +680,7 @@ def parse_channels_only(source): # Periodically clear the existing_epgs cache to prevent memory buildup if processed_channels % 1000 == 0: - logger.info(f"Clearing existing_epgs cache at {processed_channels} channels") + logger.info(f"[parse_channels_only] Clearing existing_epgs cache at {processed_channels} channels") existing_epgs.clear() gc.collect() if process: @@ -694,7 +696,7 @@ def parse_channels_only(source): processed=processed_channels, total=total_channels ) - logger.debug(f"Processed channel: {tvg_id} - {display_name}") + logger.debug(f"[parse_channels_only] Processed channel: {tvg_id} - {display_name}") # Clear memory try: # First clear the element's content @@ -719,26 +721,72 @@ def parse_channels_only(source): except Exception as e: # Just log the error and continue - don't let cleanup errors stop processing - logger.debug(f"Non-critical error during XML element cleanup: {e}") + logger.debug(f"[parse_channels_only] Non-critical error during XML element cleanup: {e}") # Check if we should break early to avoid excessive sleep if processed_channels >= total_channels and total_channels > 0: - logger.info(f"Breaking channel processing loop - processed {processed_channels}/{total_channels}") + logger.info(f"[parse_channels_only] Expected channel numbers hit, continuing - processed {processed_channels}/{total_channels}") + logger.debug(f"[parse_channels_only] Memory usage after {processed_channels}: {process.memory_info().rss / 1024 / 1024:.2f} MB") break + logger.debug(f"[parse_channels_only] Total elements processed: {total_elements_processed}") + # Add periodic forced cleanup based on TOTAL ELEMENTS, not just channels + # This ensures we clean up even if processing many non-channel elements + if total_elements_processed % 1000 == 0: + logger.info(f"[parse_channels_only] Performing preventative memory cleanup after {total_elements_processed} elements (found {processed_channels} channels)") + # Close and reopen the parser to release memory + if source_file and channel_parser: + # First clear element references + elem.clear() + if elem.getparent() is not None: + elem.getparent().remove(elem) + + # Reset parser state + del channel_parser + channel_parser = None + gc.collect() + + # Perform thorough cleanup + cleanup_memory(log_usage=True, force_collection=True) + + # Create a new parser context + channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") + + # Also do cleanup based on processed channels as before + elif processed_channels % 1000 == 0 and processed_channels > 0: + logger.info(f"[parse_channels_only] Performing preventative memory cleanup at {processed_channels} channels") + # Close and reopen the parser to release memory + if source_file and channel_parser: + # First clear element references + elem.clear() + if elem.getparent() is not None: + elem.getparent().remove(elem) + + # Reset parser state + del channel_parser + channel_parser = None + gc.collect() + + # Perform thorough cleanup + cleanup_memory(log_usage=True, force_collection=True) + + # Create a new parser context + channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") # Explicit cleanup before sleeping - logger.info(f"Completed channel parsing loop, processed {processed_channels} channels") + 
logger.info(f"[parse_channels_only] Completed channel parsing loop, processed {processed_channels} channels") if process: logger.info(f"[parse_channels_only] Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Explicit cleanup of the parser del channel_parser - logger.info(f"Deleted channel_parser object") + logger.info(f"[parse_channels_only] Deleted channel_parser object") # Close the file - logger.info(f"Closing file: {file_path}") + logger.info(f"[parse_channels_only] Closing file: {file_path}") source_file.close() - logger.info(f"File closed: {file_path}") + logger.info(f"[parse_channels_only] File closed: {file_path}") # Force garbage collection gc.collect() @@ -749,7 +797,7 @@ def parse_channels_only(source): # time.sleep(200) # This seems excessive and may be causing issues except (etree.XMLSyntaxError, Exception) as xml_error: - logger.error(f"XML parsing failed: {xml_error}") + logger.error(f"[parse_channels_only] XML parsing failed: {xml_error}") # Update status to error source.status = 'error' source.last_message = f"Error parsing XML file: {str(xml_error)}" @@ -760,11 +808,11 @@ def parse_channels_only(source): # Process any remaining items if epgs_to_create: EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) - logger.info(f"Created final batch of {len(epgs_to_create)} EPG entries") + logger.info(f"[parse_channels_only] Created final batch of {len(epgs_to_create)} EPG entries") if epgs_to_update: EPGData.objects.bulk_update(epgs_to_update, ["name"]) - logger.info(f"Updated final batch of {len(epgs_to_update)} EPG entries") + logger.info(f"[parse_channels_only] Updated final batch of {len(epgs_to_update)} EPG entries") # Final garbage collection and memory tracking if process: From cc060bbed6330465deeb1788f787a37d1f7250a3 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 14:49:29 -0500 Subject: [PATCH 14/25] More memory logging to debug. 
--- apps/epg/tasks.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index dfb290c0..f57820f1 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -673,7 +673,12 @@ def parse_channels_only(source): logger.info(f"[parse_channels_only] Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") if len(epgs_to_update) >= batch_size: + logger.info(f"[parse_channels_only] Bulk updating {len(epgs_to_update)} EPG entries") + if process: + logger.info(f"[parse_channels_only] Memory before bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") EPGData.objects.bulk_update(epgs_to_update, ["name"]) + if process: + logger.info(f"[parse_channels_only] Memory after bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") epgs_to_update = [] # Force garbage collection gc.collect() @@ -697,6 +702,8 @@ def parse_channels_only(source): total=total_channels ) logger.debug(f"[parse_channels_only] Processed channel: {tvg_id} - {display_name}") + if process: + logger.info(f"[parse_channels_only] Memory before elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Clear memory try: # First clear the element's content @@ -718,16 +725,17 @@ def parse_channels_only(source): # Element might already be removed or detached pass cleanup_memory(log_usage=True, force_collection=True) - + time.sleep(.1) except Exception as e: # Just log the error and continue - don't let cleanup errors stop processing logger.debug(f"[parse_channels_only] Non-critical error during XML element cleanup: {e}") - + if process: + logger.info(f"[parse_channels_only] Memory after elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Check if we should break early to avoid excessive sleep if processed_channels >= total_channels and total_channels > 0: logger.info(f"[parse_channels_only] Expected channel numbers hit, continuing - processed {processed_channels}/{total_channels}") logger.debug(f"[parse_channels_only] Memory usage after {processed_channels}: {process.memory_info().rss / 1024 / 1024:.2f} MB") - break + #break logger.debug(f"[parse_channels_only] Total elements processed: {total_elements_processed}") # Add periodic forced cleanup based on TOTAL ELEMENTS, not just channels # This ensures we clean up even if processing many non-channel elements @@ -773,14 +781,18 @@ def parse_channels_only(source): # Create a new parser context channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") - + if process: + logger.info(f"[parse_channels_only] Memory after leaving for loop: {process.memory_info().rss / 1024 / 1024:.2f} MB") + time.sleep(20) # Explicit cleanup before sleeping logger.info(f"[parse_channels_only] Completed channel parsing loop, processed {processed_channels} channels") if process: logger.info(f"[parse_channels_only] Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Explicit cleanup of the parser - del channel_parser + + #del channel_parser + logger.info(f"[parse_channels_only] Deleted channel_parser object") # Close the file @@ -804,6 +816,8 @@ def parse_channels_only(source): source.save(update_fields=['status', 'last_message']) send_epg_update(source.id, "parsing_channels", 100, status="error", error=str(xml_error)) return False + if process: + logger.info(f"[parse_channels_only] Memory before final batch creation: {process.memory_info().rss / 1024 / 
1024:.2f} MB") # Process any remaining items if epgs_to_create: @@ -813,6 +827,8 @@ def parse_channels_only(source): if epgs_to_update: EPGData.objects.bulk_update(epgs_to_update, ["name"]) logger.info(f"[parse_channels_only] Updated final batch of {len(epgs_to_update)} EPG entries") + if process: + logger.info(f"[parse_channels_only] Memory after final batch creation: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Final garbage collection and memory tracking if process: From d52ff40db1f9c2771c4c8186a53766a4223fdebc Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 15:15:00 -0500 Subject: [PATCH 15/25] More logging --- apps/epg/tasks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index f57820f1..fd554076 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -569,7 +569,7 @@ def parse_channels_only(source): epgs_to_update = [] total_channels = 0 processed_channels = 0 - batch_size = 500 # Process in batches to limit memory usage + batch_size = 10 # Process in batches to limit memory usage progress = 0 # Initialize progress variable here # Track memory at key points @@ -645,18 +645,21 @@ def parse_channels_only(source): name=display_name, epg_source=source, )) + logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 1: {tvg_id} - {display_name}") continue epg_obj = existing_epgs[tvg_id] if epg_obj.name != display_name: epg_obj.name = display_name epgs_to_update.append(epg_obj) + logger.debug(f"[parse_channels_only] Added channel to update to epgs_to_update: {tvg_id} - {display_name}") else: epgs_to_create.append(EPGData( tvg_id=tvg_id, name=display_name, epg_source=source, )) + logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 2: {tvg_id} - {display_name}") processed_channels += 1 From 7d6ef38bce0c315141a6e374963e868a9ffb8a60 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 15:34:03 -0500 Subject: [PATCH 16/25] Added helpful comments. 
--- apps/epg/tasks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index fd554076..031e8965 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -569,7 +569,7 @@ def parse_channels_only(source): epgs_to_update = [] total_channels = 0 processed_channels = 0 - batch_size = 10 # Process in batches to limit memory usage + batch_size = 500 # Process in batches to limit memory usage progress = 0 # Initialize progress variable here # Track memory at key points @@ -636,6 +636,7 @@ def parse_channels_only(source): # Only fetch the object if we need to update it and it hasn't been loaded yet if tvg_id not in existing_epgs: try: + # This loads the full EPG object from the database and caches it existing_epgs[tvg_id] = EPGData.objects.get(tvg_id=tvg_id, epg_source=source) except EPGData.DoesNotExist: # Handle race condition where record was deleted @@ -648,12 +649,15 @@ def parse_channels_only(source): logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 1: {tvg_id} - {display_name}") continue + # We use the cached object to check if the name has changed epg_obj = existing_epgs[tvg_id] if epg_obj.name != display_name: + # Only update if the name actually changed epg_obj.name = display_name epgs_to_update.append(epg_obj) logger.debug(f"[parse_channels_only] Added channel to update to epgs_to_update: {tvg_id} - {display_name}") else: + # This is a new channel that doesn't exist in our database epgs_to_create.append(EPGData( tvg_id=tvg_id, name=display_name, @@ -687,7 +691,7 @@ def parse_channels_only(source): gc.collect() # Periodically clear the existing_epgs cache to prevent memory buildup - if processed_channels % 1000 == 0: + if processed_channels % 100 == 0: logger.info(f"[parse_channels_only] Clearing existing_epgs cache at {processed_channels} channels") existing_epgs.clear() gc.collect() From ae823ae8eaa89d724b014eb8deb2126a68bb67ec Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 15:48:21 -0500 Subject: [PATCH 17/25] Even more debug logging. 
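All of this logging is probing the same streaming loop. Written out on its own, its shape is roughly the sketch below (names are illustrative; it anticipates the channel/programme handling that PATCH 18 further down settles on):

    import gzip
    from lxml import etree

    def iter_channels(path):
        """Stream <channel> entries from an XMLTV file without building the whole tree."""
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "rb") as fh:
            for _, elem in etree.iterparse(fh, events=("end",), tag=("channel", "programme")):
                if elem.tag == "programme":
                    break  # channels precede programmes in XMLTV, so nothing is left to collect
                yield elem.get("id", ""), (elem.findtext("display-name") or "").strip()
                # Release this element and any already-processed siblings so the tree
                # lxml keeps behind the iterator does not grow with the file.
                elem.clear()
                parent = elem.getparent()
                if parent is not None:
                    while elem.getprevious() is not None:
                        del parent[0]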
--- apps/epg/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 031e8965..9153485f 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -691,7 +691,7 @@ def parse_channels_only(source): gc.collect() # Periodically clear the existing_epgs cache to prevent memory buildup - if processed_channels % 100 == 0: + if processed_channels % 1000 == 0: logger.info(f"[parse_channels_only] Clearing existing_epgs cache at {processed_channels} channels") existing_epgs.clear() gc.collect() @@ -733,6 +733,7 @@ def parse_channels_only(source): pass cleanup_memory(log_usage=True, force_collection=True) time.sleep(.1) + except Exception as e: # Just log the error and continue - don't let cleanup errors stop processing logger.debug(f"[parse_channels_only] Non-critical error during XML element cleanup: {e}") @@ -788,6 +789,11 @@ def parse_channels_only(source): # Create a new parser context channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") + + if processed_channels == total_channels: + logger.info(f"[parse_channels_only] Processed all channels current memory: {process.memory_info().rss / 1024 / 1024:.2f} MB") + + if process: logger.info(f"[parse_channels_only] Memory after leaving for loop: {process.memory_info().rss / 1024 / 1024:.2f} MB") time.sleep(20) From cfad5621ce4328d0975544ad2e2c3d81ff896bf1 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 18:43:43 -0500 Subject: [PATCH 18/25] Found the problem. I hate lxml. --- apps/epg/tasks.py | 434 +++++++++++++++++++++++----------------------- 1 file changed, 220 insertions(+), 214 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 9153485f..f605abc6 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -532,11 +532,20 @@ def parse_channels_only(source): source.last_message = f"No URL provided, cannot fetch EPG data" source.save(update_fields=['updated_at']) - # Initialize process variable for memory tracking + # Initialize process variable for memory tracking only in debug mode try: - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_channels_only] Initial memory usage: {initial_memory:.2f} MB") + process = None + # Get current log level as a number rather than string + current_log_level = logger.getEffectiveLevel() + + # Only track memory usage when log level is DEBUG (10) or more verbose + # This is more future-proof than string comparisons + if current_log_level <= logging.DEBUG or settings.DEBUG: + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.debug(f"[parse_channels_only] Initial memory usage: {initial_memory:.2f} MB") + else: + logger.debug("Memory tracking disabled in production mode") except (ImportError, NameError): process = None logger.warning("psutil not available for memory tracking") @@ -596,230 +605,219 @@ def parse_channels_only(source): total_channels = 500 # Default estimate # Close the file to reset position - logger.info(f"Closing initial file handle") + logger.debug(f"Closing initial file handle") source_file.close() if process: - logger.info(f"[parse_channels_only] Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.debug(f"[parse_channels_only] Memory after closing initial file: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update progress after counting 
send_epg_update(source.id, "parsing_channels", 25, total_channels=total_channels) # Reset file position for actual processing - logger.info(f"Re-opening file for channel parsing: {file_path}") + logger.debug(f"Re-opening file for channel parsing: {file_path}") source_file = gzip.open(file_path, 'rb') if is_gzipped else open(file_path, 'rb') if process: - logger.info(f"[parse_channels_only] Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.debug(f"[parse_channels_only] Memory after re-opening file: {process.memory_info().rss / 1024 / 1024:.2f} MB") - logger.info(f"Creating iterparse context") - channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') + # Change iterparse to look for both channel and programme elements + logger.debug(f"Creating iterparse context for channels and programmes") + channel_parser = etree.iterparse(source_file, events=('end',), tag=('channel', 'programme')) if process: - logger.info(f"[parse_channels_only] Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.debug(f"[parse_channels_only] Memory after creating iterparse: {process.memory_info().rss / 1024 / 1024:.2f} MB") channel_count = 0 total_elements_processed = 0 # Track total elements processed, not just channels for _, elem in channel_parser: total_elements_processed += 1 - channel_count += 1 - tvg_id = elem.get('id', '').strip() - if tvg_id: - display_name = None - for child in elem: - if child.tag == 'display-name' and child.text: - display_name = child.text.strip() - break - if not display_name: - display_name = tvg_id - - # Use lazy loading approach to reduce memory usage - if tvg_id in existing_tvg_ids: - # Only fetch the object if we need to update it and it hasn't been loaded yet - if tvg_id not in existing_epgs: - try: - # This loads the full EPG object from the database and caches it - existing_epgs[tvg_id] = EPGData.objects.get(tvg_id=tvg_id, epg_source=source) - except EPGData.DoesNotExist: - # Handle race condition where record was deleted - existing_tvg_ids.remove(tvg_id) - epgs_to_create.append(EPGData( - tvg_id=tvg_id, - name=display_name, - epg_source=source, - )) - logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 1: {tvg_id} - {display_name}") - continue - - # We use the cached object to check if the name has changed - epg_obj = existing_epgs[tvg_id] - if epg_obj.name != display_name: - # Only update if the name actually changed - epg_obj.name = display_name - epgs_to_update.append(epg_obj) - logger.debug(f"[parse_channels_only] Added channel to update to epgs_to_update: {tvg_id} - {display_name}") - else: - # This is a new channel that doesn't exist in our database - epgs_to_create.append(EPGData( - tvg_id=tvg_id, - name=display_name, - epg_source=source, - )) - logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 2: {tvg_id} - {display_name}") - - processed_channels += 1 - - # Batch processing - if len(epgs_to_create) >= batch_size: - logger.info(f"[parse_channels_only] Bulk creating {len(epgs_to_create)} EPG entries") - EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) - if process: - logger.info(f"[parse_channels_only] Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") - del epgs_to_create # Explicit deletion - epgs_to_create = [] - gc.collect() - if process: - logger.info(f"[parse_channels_only] Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") - - if len(epgs_to_update) >= 
batch_size: - logger.info(f"[parse_channels_only] Bulk updating {len(epgs_to_update)} EPG entries") - if process: - logger.info(f"[parse_channels_only] Memory before bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") - EPGData.objects.bulk_update(epgs_to_update, ["name"]) - if process: - logger.info(f"[parse_channels_only] Memory after bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") - epgs_to_update = [] - # Force garbage collection - gc.collect() - - # Periodically clear the existing_epgs cache to prevent memory buildup - if processed_channels % 1000 == 0: - logger.info(f"[parse_channels_only] Clearing existing_epgs cache at {processed_channels} channels") - existing_epgs.clear() - gc.collect() - if process: - logger.info(f"[parse_channels_only] Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") - - # Send progress updates - if processed_channels % 100 == 0 or processed_channels == total_channels: - progress = 25 + int((processed_channels / total_channels) * 65) if total_channels > 0 else 90 - send_epg_update( - source.id, - "parsing_channels", - progress, - processed=processed_channels, - total=total_channels - ) - logger.debug(f"[parse_channels_only] Processed channel: {tvg_id} - {display_name}") - if process: - logger.info(f"[parse_channels_only] Memory before elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # Clear memory - try: - # First clear the element's content + # If we encounter a programme element, we've processed all channels + # Break out of the loop to avoid memory spike + if elem.tag == 'programme': + logger.debug(f"[parse_channels_only] Found first programme element after processing {processed_channels} channels - exiting channel parsing") + # Clean up the element before breaking elem.clear() - - # Get the parent before we might lose reference to it parent = elem.getparent() if parent is not None: - # Clean up preceding siblings - while elem.getprevious() is not None: - del parent[0] + parent.remove(elem) + break - # Try to fully detach this element from parent - try: - parent.remove(elem) - del elem - del parent - except (ValueError, KeyError, TypeError): - # Element might already be removed or detached - pass - cleanup_memory(log_usage=True, force_collection=True) - time.sleep(.1) + # Only process channel elements + if elem.tag == 'channel': + channel_count += 1 + tvg_id = elem.get('id', '').strip() + if tvg_id: + display_name = None + for child in elem: + if child.tag == 'display-name' and child.text: + display_name = child.text.strip() + break - except Exception as e: - # Just log the error and continue - don't let cleanup errors stop processing - logger.debug(f"[parse_channels_only] Non-critical error during XML element cleanup: {e}") - if process: - logger.info(f"[parse_channels_only] Memory after elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # Check if we should break early to avoid excessive sleep - if processed_channels >= total_channels and total_channels > 0: - logger.info(f"[parse_channels_only] Expected channel numbers hit, continuing - processed {processed_channels}/{total_channels}") - logger.debug(f"[parse_channels_only] Memory usage after {processed_channels}: {process.memory_info().rss / 1024 / 1024:.2f} MB") - #break - logger.debug(f"[parse_channels_only] Total elements processed: {total_elements_processed}") - # Add periodic forced cleanup based on TOTAL ELEMENTS, not just channels - # This ensures we clean up even if processing many non-channel elements - 
if total_elements_processed % 1000 == 0: - logger.info(f"[parse_channels_only] Performing preventative memory cleanup after {total_elements_processed} elements (found {processed_channels} channels)") - # Close and reopen the parser to release memory - if source_file and channel_parser: - # First clear element references - elem.clear() - if elem.getparent() is not None: - elem.getparent().remove(elem) + if not display_name: + display_name = tvg_id - # Reset parser state - del channel_parser - channel_parser = None + # Use lazy loading approach to reduce memory usage + if tvg_id in existing_tvg_ids: + # Only fetch the object if we need to update it and it hasn't been loaded yet + if tvg_id not in existing_epgs: + try: + # This loads the full EPG object from the database and caches it + existing_epgs[tvg_id] = EPGData.objects.get(tvg_id=tvg_id, epg_source=source) + except EPGData.DoesNotExist: + # Handle race condition where record was deleted + existing_tvg_ids.remove(tvg_id) + epgs_to_create.append(EPGData( + tvg_id=tvg_id, + name=display_name, + epg_source=source, + )) + logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 1: {tvg_id} - {display_name}") + continue + + # We use the cached object to check if the name has changed + epg_obj = existing_epgs[tvg_id] + if epg_obj.name != display_name: + # Only update if the name actually changed + epg_obj.name = display_name + epgs_to_update.append(epg_obj) + logger.debug(f"[parse_channels_only] Added channel to update to epgs_to_update: {tvg_id} - {display_name}") + else: + # This is a new channel that doesn't exist in our database + epgs_to_create.append(EPGData( + tvg_id=tvg_id, + name=display_name, + epg_source=source, + )) + logger.debug(f"[parse_channels_only] Added new channel to epgs_to_create 2: {tvg_id} - {display_name}") + + processed_channels += 1 + + # Batch processing + if len(epgs_to_create) >= batch_size: + logger.info(f"[parse_channels_only] Bulk creating {len(epgs_to_create)} EPG entries") + EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) + if process: + logger.info(f"[parse_channels_only] Memory after bulk_create: {process.memory_info().rss / 1024 / 1024:.2f} MB") + del epgs_to_create # Explicit deletion + epgs_to_create = [] + gc.collect() + if process: + logger.info(f"[parse_channels_only] Memory after gc.collect(): {process.memory_info().rss / 1024 / 1024:.2f} MB") + + if len(epgs_to_update) >= batch_size: + logger.info(f"[parse_channels_only] Bulk updating {len(epgs_to_update)} EPG entries") + if process: + logger.info(f"[parse_channels_only] Memory before bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") + EPGData.objects.bulk_update(epgs_to_update, ["name"]) + if process: + logger.info(f"[parse_channels_only] Memory after bulk_update: {process.memory_info().rss / 1024 / 1024:.2f} MB") + epgs_to_update = [] + # Force garbage collection gc.collect() - # Perform thorough cleanup - cleanup_memory(log_usage=True, force_collection=True) - - # Create a new parser context - channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') - logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") - - # Also do cleanup based on processed channels as before - elif processed_channels % 1000 == 0 and processed_channels > 0: - logger.info(f"[parse_channels_only] Performing preventative memory cleanup at {processed_channels} channels") - # Close and reopen the parser to release memory - if source_file and channel_parser: - # First clear element 
references - elem.clear() - if elem.getparent() is not None: - elem.getparent().remove(elem) - - # Reset parser state - del channel_parser - channel_parser = None + # Periodically clear the existing_epgs cache to prevent memory buildup + if processed_channels % 1000 == 0: + logger.info(f"[parse_channels_only] Clearing existing_epgs cache at {processed_channels} channels") + existing_epgs.clear() gc.collect() + if process: + logger.info(f"[parse_channels_only] Memory after clearing cache: {process.memory_info().rss / 1024 / 1024:.2f} MB") - # Perform thorough cleanup + # Send progress updates + if processed_channels % 100 == 0 or processed_channels == total_channels: + progress = 25 + int((processed_channels / total_channels) * 65) if total_channels > 0 else 90 + send_epg_update( + source.id, + "parsing_channels", + progress, + processed=processed_channels, + total=total_channels + ) + logger.debug(f"[parse_channels_only] Processed channel: {tvg_id} - {display_name}") + if process: + logger.info(f"[parse_channels_only] Memory before elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Clear memory + try: + # First clear the element's content + elem.clear() + + # Get the parent before we might lose reference to it + parent = elem.getparent() + if parent is not None: + # Clean up preceding siblings + while elem.getprevious() is not None: + del parent[0] + + # Try to fully detach this element from parent + try: + parent.remove(elem) + del elem + del parent + except (ValueError, KeyError, TypeError): + # Element might already be removed or detached + pass cleanup_memory(log_usage=True, force_collection=True) + #time.sleep(.1) - # Create a new parser context - channel_parser = etree.iterparse(source_file, events=('end',), tag='channel') - logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") + except Exception as e: + # Just log the error and continue - don't let cleanup errors stop processing + logger.debug(f"[parse_channels_only] Non-critical error during XML element cleanup: {e}") + if process: + logger.info(f"[parse_channels_only] Memory after elem cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Check if we should break early to avoid excessive sleep + if processed_channels >= total_channels and total_channels > 0: + logger.info(f"[parse_channels_only] Expected channel numbers hit, continuing - processed {processed_channels}/{total_channels}") + logger.debug(f"[parse_channels_only] Memory usage after {processed_channels}: {process.memory_info().rss / 1024 / 1024:.2f} MB") + #break + logger.debug(f"[parse_channels_only] Total elements processed: {total_elements_processed}") + # Add periodic forced cleanup based on TOTAL ELEMENTS, not just channels + # This ensures we clean up even if processing many non-channel elements + if total_elements_processed % 1000 == 0: + logger.info(f"[parse_channels_only] Performing preventative memory cleanup after {total_elements_processed} elements (found {processed_channels} channels)") + # Close and reopen the parser to release memory + if source_file and channel_parser: + # First clear element references + elem.clear() + if elem.getparent() is not None: + elem.getparent().remove(elem) - if processed_channels == total_channels: - logger.info(f"[parse_channels_only] Processed all channels current memory: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Reset parser state + del channel_parser + channel_parser = None + gc.collect() + # Perform thorough cleanup + cleanup_memory(log_usage=True, 
force_collection=True) - if process: - logger.info(f"[parse_channels_only] Memory after leaving for loop: {process.memory_info().rss / 1024 / 1024:.2f} MB") - time.sleep(20) - # Explicit cleanup before sleeping - logger.info(f"[parse_channels_only] Completed channel parsing loop, processed {processed_channels} channels") - if process: - logger.info(f"[parse_channels_only] Memory before cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") + # Create a new parser context - continue looking for both tags + # This doesn't restart from the beginning but continues from current position + channel_parser = etree.iterparse(source_file, events=('end',), tag=('channel', 'programme')) + logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") - # Explicit cleanup of the parser + # Also do cleanup based on processed channels as before + elif processed_channels % 1000 == 0 and processed_channels > 0: + logger.info(f"[parse_channels_only] Performing preventative memory cleanup at {processed_channels} channels") + # Close and reopen the parser to release memory + if source_file and channel_parser: + # First clear element references + elem.clear() + if elem.getparent() is not None: + elem.getparent().remove(elem) - #del channel_parser + # Reset parser state + del channel_parser + channel_parser = None + gc.collect() - logger.info(f"[parse_channels_only] Deleted channel_parser object") + # Perform thorough cleanup + cleanup_memory(log_usage=True, force_collection=True) - # Close the file - logger.info(f"[parse_channels_only] Closing file: {file_path}") - source_file.close() - logger.info(f"[parse_channels_only] File closed: {file_path}") + # Create a new parser context + # This doesn't restart from the beginning but continues from current position + channel_parser = etree.iterparse(source_file, events=('end',), tag=('channel', 'programme')) + logger.info(f"[parse_channels_only] Recreated parser context after memory cleanup") - # Force garbage collection - gc.collect() - if process: - logger.info(f"[parse_channels_only] Memory after final cleanup: {process.memory_info().rss / 1024 / 1024:.2f} MB") - - # Remove long sleep that might be causing issues - # time.sleep(200) # This seems excessive and may be causing issues + if processed_channels == total_channels: + logger.info(f"[parse_channels_only] Processed all channels current memory: {process.memory_info().rss / 1024 / 1024:.2f} MB") except (etree.XMLSyntaxError, Exception) as xml_error: logger.error(f"[parse_channels_only] XML parsing failed: {xml_error}") @@ -830,25 +828,18 @@ def parse_channels_only(source): send_epg_update(source.id, "parsing_channels", 100, status="error", error=str(xml_error)) return False if process: - logger.info(f"[parse_channels_only] Memory before final batch creation: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.debug(f"[parse_channels_only] Memory before final batch creation: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Process any remaining items if epgs_to_create: EPGData.objects.bulk_create(epgs_to_create, ignore_conflicts=True) - logger.info(f"[parse_channels_only] Created final batch of {len(epgs_to_create)} EPG entries") + logger.debug(f"[parse_channels_only] Created final batch of {len(epgs_to_create)} EPG entries") if epgs_to_update: EPGData.objects.bulk_update(epgs_to_update, ["name"]) - logger.info(f"[parse_channels_only] Updated final batch of {len(epgs_to_update)} EPG entries") + logger.debug(f"[parse_channels_only] Updated final batch of {len(epgs_to_update)} 
EPG entries") if process: - logger.info(f"[parse_channels_only] Memory after final batch creation: {process.memory_info().rss / 1024 / 1024:.2f} MB") - - # Final garbage collection and memory tracking - if process: - logger.info(f"[parse_channels_only] Memory before final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") - gc.collect() - if process: - logger.info(f"[parse_channels_only] Memory after final gc: {process.memory_info().rss / 1024 / 1024:.2f} MB") + logger.debug(f"[parse_channels_only] Memory after final batch creation: {process.memory_info().rss / 1024 / 1024:.2f} MB") # Update source status with channel count source.status = 'success' @@ -889,19 +880,34 @@ def parse_channels_only(source): return False finally: # Add more detailed cleanup in finally block - logger.info("In finally block, ensuring cleanup") - existing_tvg_ids = None - existing_epgs = None - - # Check final memory usage after clearing process - gc.collect() - # Add comprehensive cleanup at end of channel parsing - cleanup_memory(log_usage=True, force_collection=True) + logger.debug("In finally block, ensuring cleanup") try: - process = psutil.Process() - final_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_channels_only] Final memory usage: {final_memory:.2f} MB") - process = None + if 'channel_parser' in locals(): + del channel_parser + if 'elem' in locals(): + del elem + if 'parent' in locals(): + del parent + + if 'source_file' in locals(): + source_file.close() + del source_file + # Clear remaining large data structures + existing_epgs.clear() + epgs_to_create.clear() + epgs_to_update.clear() + existing_epgs = None + epgs_to_create = None + epgs_to_update = None + cleanup_memory(log_usage=True, force_collection=True) + except Exception as e: + logger.warning(f"Cleanup error: {e}") + + try: + if process: + final_memory = process.memory_info().rss / 1024 / 1024 + logger.debug(f"[parse_channels_only] Final memory usage: {final_memory:.2f} MB") + process = None except: pass From a74160a0b6b858e218df426684c85746fb46acf8 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 18:51:40 -0500 Subject: [PATCH 19/25] Add lxml to base image and set base to build automatically if requirements changes. --- .github/workflows/base-image.yml | 2 ++ requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/base-image.yml b/.github/workflows/base-image.yml index 955043fb..1da33d4f 100644 --- a/.github/workflows/base-image.yml +++ b/.github/workflows/base-image.yml @@ -6,11 +6,13 @@ on: paths: - 'docker/DispatcharrBase' - '.github/workflows/base-image.yml' + - 'requirements.txt' pull_request: branches: [ main, dev ] paths: - 'docker/DispatcharrBase' - '.github/workflows/base-image.yml' + - 'requirements.txt' workflow_dispatch: # Allow manual triggering permissions: diff --git a/requirements.txt b/requirements.txt index 7d7117f4..22a51fbc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ channels channels-redis django-filter django-celery-beat +lxml==5.4.0 \ No newline at end of file From 4141aa11e540b866f71ff14edaacae308962b8c5 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 18:55:05 -0500 Subject: [PATCH 20/25] Removed memory-profiler from requirements. 
---
 requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 8810e336..22a51fbc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,4 @@ channels
 channels-redis
 django-filter
 django-celery-beat
-memory-profiler==0.61.0
-lxml==5.4.0
+lxml==5.4.0
\ No newline at end of file

From 56be7b71940ef4becc1a165c93b2a6d0448f20f8 Mon Sep 17 00:00:00 2001
From: SergeantPanda
Date: Tue, 20 May 2025 19:17:01 -0500
Subject: [PATCH 21/25] Added missing gc import for auto-match.

---
 apps/channels/tasks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/apps/channels/tasks.py b/apps/channels/tasks.py
index d59169e5..6217a4ca 100755
--- a/apps/channels/tasks.py
+++ b/apps/channels/tasks.py
@@ -7,6 +7,7 @@ import time
 import json
 import subprocess
 from datetime import datetime
+import gc
 from celery import shared_task
 from django.utils.text import slugify

From 451c8924571fe05bfaa89c460e0e7304b08b9098 Mon Sep 17 00:00:00 2001
From: SergeantPanda
Date: Tue, 20 May 2025 19:49:55 -0500
Subject: [PATCH 22/25] Changed logging levels.

---
 apps/epg/tasks.py | 58 +++++++++++++++++++++++++++++++----------------
 core/utils.py     |  7 +++---
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py
index f605abc6..f31064de 100644
--- a/apps/epg/tasks.py
+++ b/apps/epg/tasks.py
@@ -540,7 +540,9 @@ def parse_channels_only(source):
         # Only track memory usage when log level is DEBUG (10) or more verbose
-        # This is more future-proof than string comparisons
-        if current_log_level <= logging.DEBUG or settings.DEBUG:
+        # This is more future-proof than string comparisons
+        should_log_memory = current_log_level <= logging.DEBUG or settings.DEBUG
+
+        if should_log_memory:
             process = psutil.Process()
             initial_memory = process.memory_info().rss / 1024 / 1024
             logger.debug(f"[parse_channels_only] Initial memory usage: {initial_memory:.2f} MB")
@@ -548,6 +550,7 @@ def parse_channels_only(source):
             logger.debug("Memory tracking disabled in production mode")
     except (ImportError, NameError):
         process = None
+        should_log_memory = False
         logger.warning("psutil not available for memory tracking")
     # Replace full dictionary load with more efficient lookup set
@@ -755,7 +758,7 @@ def parse_channels_only(source):
                     except (ValueError, KeyError, TypeError):
                         # Element might already be removed or detached
                         pass
-                    cleanup_memory(log_usage=True, force_collection=True)
+                    cleanup_memory(log_usage=should_log_memory, force_collection=True)
                     #time.sleep(.1)
                 except Exception as e:
@@ -786,7 +789,7 @@ def parse_channels_only(source):
                         gc.collect()
                         # Perform thorough cleanup
-                        cleanup_memory(log_usage=True, force_collection=True)
+                        cleanup_memory(log_usage=should_log_memory, force_collection=True)
                         # Create a new parser context - continue looking for both tags
                         # This doesn't restart from the beginning but continues from current position
@@ -809,7 +812,7 @@ def parse_channels_only(source):
                         gc.collect()
                         # Perform thorough cleanup
-                        cleanup_memory(log_usage=True, force_collection=True)
+                        cleanup_memory(log_usage=should_log_memory, force_collection=True)
                         # Create a new parser context
                         # This doesn't restart from the beginning but continues from current position
@@ -899,7 +902,7 @@ def parse_channels_only(source):
             existing_epgs = None
             epgs_to_create = None
             epgs_to_update = None
-            cleanup_memory(log_usage=True, force_collection=True)
+            cleanup_memory(log_usage=should_log_memory, force_collection=True)
         except Exception as e:
             logger.warning(f"Cleanup error: {e}")
@@ -927,14 +930,25 @@ def
parse_programs_for_tvg_id(epg_id): mem_before = 0 # Initialize with default value to avoid UnboundLocalError try: - # Add memory tracking + # Add memory tracking only in trace mode or higher try: - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 - mem_before = initial_memory # Set mem_before here - logger.info(f"[parse_programs_for_tvg_id] Initial memory usage: {initial_memory:.2f} MB") + process = None + # Get current log level as a number + current_log_level = logger.getEffectiveLevel() + + # Only track memory usage when log level is TRACE or more verbose + should_log_memory = current_log_level <= 5 + + if should_log_memory: + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_tvg_id] Initial memory usage: {initial_memory:.2f} MB") + mem_before = initial_memory + else: + logger.debug("Memory tracking disabled in production mode") except ImportError: process = None + should_log_memory = False epg = EPGData.objects.get(id=epg_id) epg_source = epg.epg_source @@ -1193,11 +1207,8 @@ def parse_programs_for_tvg_id(epg_id): programs_to_create = None epg_source = None - # Force garbage collection before releasing lock - gc.collect() - # Add comprehensive cleanup before releasing lock - cleanup_memory(log_usage=True, force_collection=True) + cleanup_memory(log_usage=should_log_memory, force_collection=True) # Memory tracking after processing if process: mem_after = process.memory_info().rss / 1024 / 1024 @@ -1213,11 +1224,20 @@ def parse_programs_for_source(epg_source, tvg_id=None): # Send initial programs parsing notification send_epg_update(epg_source.id, "parsing_programs", 0) #time.sleep(100) - # Add memory tracking + + # Add memory tracking only in trace mode or higher try: - process = psutil.Process() - initial_memory = process.memory_info().rss / 1024 / 1024 - logger.info(f"[parse_programs_for_source] Initial memory usage: {initial_memory:.2f} MB") + process = None + # Get current log level as a number + current_log_level = logger.getEffectiveLevel() + + # Only track memory usage when log level is TRACE or more verbose + if current_log_level <= 5 or settings.DEBUG: # Assuming TRACE is level 5 or lower + process = psutil.Process() + initial_memory = process.memory_info().rss / 1024 / 1024 + logger.info(f"[parse_programs_for_source] Initial memory usage: {initial_memory:.2f} MB") + else: + logger.debug("Memory tracking disabled in production mode") except ImportError: logger.warning("psutil not available for memory tracking") process = None @@ -1333,7 +1353,7 @@ def parse_programs_for_source(epg_source, tvg_id=None): gc.collect() # Add comprehensive memory cleanup at the end - cleanup_memory(log_usage=True, force_collection=True) + cleanup_memory(log_usage=should_log_memory, force_collection=True) if process: final_memory = process.memory_info().rss / 1024 / 1024 logger.info(f"[parse_programs_for_source] Final memory usage: {final_memory:.2f} MB difference: {final_memory - initial_memory:.2f} MB") diff --git a/core/utils.py b/core/utils.py index d26b0dd3..fcff03e5 100644 --- a/core/utils.py +++ b/core/utils.py @@ -246,7 +246,7 @@ def monitor_memory_usage(func): return result return wrapper -def cleanup_memory(log_usage=True, force_collection=True): +def cleanup_memory(log_usage=False, force_collection=True): """ Comprehensive memory cleanup function to reduce memory footprint @@ -255,8 +255,9 @@ def cleanup_memory(log_usage=True, force_collection=True): force_collection: Whether to 
force garbage collection """ logger.trace("Starting memory cleanup django memory cleanup") - # Skip logging if log level is not set to debug (no reason to run memory usage if we don't log it) - if not logger.isEnabledFor(logging.DEBUG): + # Skip logging if log level is not set to debug or more verbose (like trace) + current_log_level = logger.getEffectiveLevel() + if not current_log_level <= logging.DEBUG: log_usage = False if log_usage: try: From 55089044fa58c562e22cead259082b29fe7b5b84 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 20:07:10 -0500 Subject: [PATCH 23/25] Cleaned up code and logging a bit more. --- apps/epg/tasks.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index f31064de..05612001 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -570,7 +570,6 @@ def parse_channels_only(source): existing_tvg_ids.update(tvg_id_chunk) last_id = EPGData.objects.filter(tvg_id__in=tvg_id_chunk).order_by('-id')[0].id - #time.sleep(20) # Update progress to show file read starting send_epg_update(source.id, "parsing_channels", 10) @@ -759,7 +758,6 @@ def parse_channels_only(source): # Element might already be removed or detached pass cleanup_memory(log_usage=should_log_memory, force_collection=True) - #time.sleep(.1) except Exception as e: # Just log the error and continue - don't let cleanup errors stop processing @@ -861,8 +859,7 @@ def parse_channels_only(source): send_websocket_update('updates', 'update', {"success": True, "type": "epg_channels"}) logger.info(f"Finished parsing channel info. Found {processed_channels} channels.") - # Remove excessive sleep - # time.sleep(20) + return True except FileNotFoundError: @@ -1223,16 +1220,19 @@ def parse_programs_for_tvg_id(epg_id): def parse_programs_for_source(epg_source, tvg_id=None): # Send initial programs parsing notification send_epg_update(epg_source.id, "parsing_programs", 0) - #time.sleep(100) + should_log_memory = False + process = None + initial_memory = 0 # Add memory tracking only in trace mode or higher try: - process = None # Get current log level as a number current_log_level = logger.getEffectiveLevel() # Only track memory usage when log level is TRACE or more verbose - if current_log_level <= 5 or settings.DEBUG: # Assuming TRACE is level 5 or lower + should_log_memory = current_log_level <= 5 or settings.DEBUG # Assuming TRACE is level 5 or lower + + if should_log_memory: process = psutil.Process() initial_memory = process.memory_info().rss / 1024 / 1024 logger.info(f"[parse_programs_for_source] Initial memory usage: {initial_memory:.2f} MB") @@ -1241,6 +1241,7 @@ def parse_programs_for_source(epg_source, tvg_id=None): except ImportError: logger.warning("psutil not available for memory tracking") process = None + should_log_memory = False try: # Process EPG entries in batches rather than all at once From 422bd0577af1f675b924a2478b4e399231e4f526 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 20:13:21 -0500 Subject: [PATCH 24/25] Removed cleanup from celery task. 
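The level-gated memory tracking that the previous two patches converge on reduces to the sketch below (treating TRACE as numeric level 5 is an assumption taken from the comments in those diffs; the helper name is illustrative):

    import logging
    import psutil

    logger = logging.getLogger(__name__)
    TRACE = 5  # assumed value of the custom TRACE level

    def maybe_track_memory(threshold=TRACE):
        """Only pay for psutil introspection when logging is verbose enough to surface it."""
        if logger.getEffectiveLevel() <= threshold:
            return psutil.Process()
        return None

    process = maybe_track_memory()
    if process:
        logger.debug(f"Initial RSS: {process.memory_info().rss / 1024 / 1024:.2f} MB")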
--- core/tasks.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/core/tasks.py b/core/tasks.py index 9af367f9..eee5b18f 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -2,7 +2,6 @@ from celery import shared_task from channels.layers import get_channel_layer from asgiref.sync import async_to_sync -import redis import json import logging import re @@ -47,11 +46,6 @@ def throttled_log(logger_method, message, key=None, *args, **kwargs): logger_method(message, *args, **kwargs) _last_log_times[key] = now -def clear_memory(): - """Force aggressive garbage collection to free memory""" - from core.utils import cleanup_memory - cleanup_memory(log_usage=True, force_collection=True) - @shared_task def beat_periodic_task(): fetch_channel_stats() @@ -275,9 +269,6 @@ def scan_and_process_files(): # Mark that the first scan is complete _first_scan_completed = True - # Force memory cleanup - clear_memory() - def fetch_channel_stats(): redis_client = RedisClient.get_client() @@ -314,7 +305,6 @@ def fetch_channel_stats(): # Explicitly clean up large data structures all_channels = None - gc.collect() except Exception as e: logger.error(f"Error in channel_status: {e}", exc_info=True) From 72783090cd03071750d7f4de1ffbb5ad4c366c56 Mon Sep 17 00:00:00 2001 From: SergeantPanda Date: Tue, 20 May 2025 20:36:04 -0500 Subject: [PATCH 25/25] Delete all instead of using a loop. --- apps/epg/tasks.py | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/apps/epg/tasks.py b/apps/epg/tasks.py index 05612001..60428484 100644 --- a/apps/epg/tasks.py +++ b/apps/epg/tasks.py @@ -918,14 +918,10 @@ def parse_programs_for_tvg_id(epg_id): if not acquire_task_lock('parse_epg_programs', epg_id): logger.info(f"Program parse for {epg_id} already in progress, skipping duplicate task") return "Task already running" + source_file = None program_parser = None - programs_to_create = None - epg = None - epg_source = None - programs_processed = 0 - mem_before = 0 # Initialize with default value to avoid UnboundLocalError - + programs_to_create = [] try: # Add memory tracking only in trace mode or higher try: @@ -957,18 +953,9 @@ def parse_programs_for_tvg_id(epg_id): logger.info(f"Refreshing program data for tvg_id: {epg.tvg_id}") - # First, remove all existing programs - use chunked delete to avoid memory issues - # Delete old programs - chunk_size = 5000 - last_id = 0 - while True: - ids = list(ProgramData.objects.filter(epg=epg, id__gt=last_id).order_by('id').values_list('id', flat=True)[:chunk_size]) - if not ids: - break - ProgramData.objects.filter(id__in=ids).delete() - last_id = ids[-1] - del ids - gc.collect() + # Optimize deletion with a single delete query instead of chunking + # This is faster for most database engines + ProgramData.objects.filter(epg=epg).delete() file_path = epg_source.file_path if not file_path: @@ -1106,11 +1093,10 @@ def parse_programs_for_tvg_id(epg_id): if len(programs_to_create) >= batch_size: ProgramData.objects.bulk_create(programs_to_create) logger.debug(f"Saved batch of {len(programs_to_create)} programs for {epg.tvg_id}") - del programs_to_create - del custom_props - del custom_properties_json - gc.collect() - #continue + programs_to_create = [] + # Only call gc.collect() every few batches + if programs_processed % (batch_size * 5) == 0: + gc.collect() except Exception as e: logger.error(f"Error processing program for {epg.tvg_id}: {e}", exc_info=True)