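Enhancement: Improve channel shutdown logic in ProxyServer so that channels in the CONNECTING or WAITING_FOR_CLIENTS state with zero clients are stopped on a timeout: the channel init grace period if the channel never reached ready, or the channel shutdown delay if it did. Previously, CONNECTING channels were skipped unconditionally and never checked for clients.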

2026-01-23 02:35:14 +00:00 · 2025-11-15 14:22:26 -06:00 · 2025-11-15 14:22:26 -06:00 · 6bd5958c3c
commit 6bd5958c3c
parent 0700cf29ea
1 changed files with 52 additions and 16 deletions
--- a/apps/proxy/ts_proxy/server.py
+++ b/apps/proxy/ts_proxy/server.py
@ -947,7 +947,7 @@ class ProxyServer:

                            # If in connecting or waiting_for_clients state, check grace period
                            if channel_state in [ChannelState.CONNECTING, ChannelState.WAITING_FOR_CLIENTS]:
-                                # Get connection ready time from metadata
+                                # Get connection_ready_time from metadata (indicates if channel reached ready state)
                                connection_ready_time = None
                                if metadata and b'connection_ready_time' in metadata:
                                    try:
@ -955,17 +955,60 @@ class ProxyServer:
                                    except (ValueError, TypeError):
                                        pass

-                                # If still connecting, give it more time
-                                if channel_state == ChannelState.CONNECTING:
-                                    logger.debug(f"Channel {channel_id} still connecting - not checking for clients yet")
-                                    continue
+                                if total_clients == 0:
+                                    # Check if we have a connection_attempt timestamp (set when CONNECTING starts)
+                                    connection_attempt_time = None
+                                    attempt_key = RedisKeys.connection_attempt(channel_id)
+                                    if self.redis_client:
+                                        attempt_value = self.redis_client.get(attempt_key)
+                                        if attempt_value:
+                                            try:
+                                                connection_attempt_time = float(attempt_value.decode('utf-8'))
+                                            except (ValueError, TypeError):
+                                                pass

-                                # If waiting for clients, check grace period
-                                if connection_ready_time:
+                                    # Also get init time as a fallback
+                                    init_time = None
+                                    if metadata and b'init_time' in metadata:
+                                        try:
+                                            init_time = float(metadata[b'init_time'].decode('utf-8'))
+                                        except (ValueError, TypeError):
+                                            pass
+
+                                    # Use whichever timestamp we have (prefer connection_attempt as it's more recent)
+                                    start_time = connection_attempt_time or init_time
+
+                                    if start_time:
+                                        # Check which timeout to apply based on channel lifecycle
+                                        if connection_ready_time:
+                                            # Already reached ready - use shutdown_delay
+                                            time_since_ready = time.time() - connection_ready_time
+                                            shutdown_delay = ConfigHelper.channel_shutdown_delay()
+
+                                            if time_since_ready > shutdown_delay:
+                                                logger.warning(
+                                                    f"Channel {channel_id} in {channel_state} state with 0 clients for {time_since_ready:.1f}s "
+                                                    f"(after reaching ready, shutdown_delay: {shutdown_delay}s) - stopping channel"
+                                                )
+                                                self.stop_channel(channel_id)
+                                                continue
+                                        else:
+                                            # Never reached ready - use grace_period timeout
+                                            time_since_start = time.time() - start_time
+                                            connecting_timeout = ConfigHelper.channel_init_grace_period()
+
+                                            if time_since_start > connecting_timeout:
+                                                logger.warning(
+                                                    f"Channel {channel_id} stuck in {channel_state} state for {time_since_start:.1f}s "
+                                                    f"with no clients (timeout: {connecting_timeout}s) - stopping channel due to upstream issues"
+                                                )
+                                                self.stop_channel(channel_id)
+                                                continue
+                                elif connection_ready_time:
+                                    # We have clients now, but check grace period for state transition
                                    grace_period = ConfigHelper.channel_init_grace_period()
                                    time_since_ready = time.time() - connection_ready_time

-                                    # Add this debug log
                                    logger.debug(f"GRACE PERIOD CHECK: Channel {channel_id} in {channel_state} state, "
                                                 f"time_since_ready={time_since_ready:.1f}s, grace_period={grace_period}s, "
                                                 f"total_clients={total_clients}")
@ -974,16 +1017,9 @@ class ProxyServer:
                                        # Still within grace period
                                        logger.debug(f"Channel {channel_id} in grace period - {time_since_ready:.1f}s of {grace_period}s elapsed")
                                        continue
-                                    elif total_clients == 0:
-                                        # Grace period expired with no clients
-                                        logger.info(f"Grace period expired ({time_since_ready:.1f}s > {grace_period}s) with no clients - stopping channel {channel_id}")
-                                        self.stop_channel(channel_id)
                                    else:
-                                        # Grace period expired but we have clients - mark channel as active
+                                        # Grace period expired with clients - mark channel as active
                                        logger.info(f"Grace period expired with {total_clients} clients - marking channel {channel_id} as active")
-                                        old_state = "unknown"
-                                        if metadata and b'state' in metadata:
-                                            old_state = metadata[b'state'].decode('utf-8')
                                        if self.update_channel_state(channel_id, ChannelState.ACTIVE, {
                                            "grace_period_ended_at": str(time.time()),
                                            "clients_at_activation": str(total_clients)