Selaa lähdekoodia

解决子节点不稳定

Gogs 4 päivää sitten
vanhempi
commit
6be992fb60
4 muutettua tiedostoa jossa 38 lisäystä ja 13 poistoa
  1. 2 0
      config.json
  2. 2 0
      config.py
  3. 25 13
      relay_client.py
  4. 9 0
      scheduler.py

+ 2 - 0
config.json

@@ -7,6 +7,8 @@
   "tcp_warmup_bytes": 1048576,
   "tcp_loser_grace_ms": 1500,
   "probe_interval": 3,
+  "relay_reconnect_delay": 1,
+  "relay_reconnect_max_delay": 10,
   "relays": [
     {"name": "hk1", "host": "23.95.134.159", "port": 9009, "token": "130", "weight": 100},
     {"name": "hk2", "host": "23.238.9.140", "port": 9009, "token": "130", "weight": 100}

+ 2 - 0
config.py

@@ -32,6 +32,7 @@ class Config:
     tcp_connect_happy_eyeballs_delay: float | None = None
     relay_reconnect_delay: float = 3.0
     relay_reconnect_attempts: int = 5
+    relay_reconnect_max_delay: float = 30.0
     relay_ping_interval: float = 10.0
     relay_ping_timeout: float = 25.0
     relay_tcp_nodelay: bool = True
@@ -57,6 +58,7 @@ class Config:
             tcp_connect_happy_eyeballs_delay=raw.get("tcp_connect_happy_eyeballs_delay"),
             relay_reconnect_delay=raw.get("relay_reconnect_delay", 3.0),
             relay_reconnect_attempts=max(1, raw.get("relay_reconnect_attempts", 5)),
+            relay_reconnect_max_delay=max(raw.get("relay_reconnect_delay", 3.0), raw.get("relay_reconnect_max_delay", 30.0)),
             relay_ping_interval=max(1.0, raw.get("relay_ping_interval", 10.0)),
             relay_ping_timeout=max(1.0, raw.get("relay_ping_timeout", 25.0)),
             relay_tcp_nodelay=raw.get("relay_tcp_nodelay", True),

+ 25 - 13
relay_client.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import asyncio
 import contextlib
+import random
 import socket
 from dataclasses import dataclass
 import time
@@ -27,6 +28,7 @@ class RelayConnection:
     keepalive_task: asyncio.Task | None = None
     last_pong_at: float = 0.0
     send_lock: asyncio.Lock | None = None
+    closed_event: asyncio.Event | None = None
 
     def __post_init__(self) -> None:
         if self.handlers is None:
@@ -35,6 +37,8 @@ class RelayConnection:
             self.dispatch_tasks = {}
         if self.send_lock is None:
             self.send_lock = asyncio.Lock()
+        if self.closed_event is None:
+            self.closed_event = asyncio.Event()
 
     async def start(self) -> None:
         print(f"[edge] connecting relay name={self.node.name} addr={self.node.host}:{self.node.port}")
@@ -127,6 +131,8 @@ class RelayConnection:
         if self.closed:
             return
         self.closed = True
+        assert self.closed_event is not None
+        self.closed_event.set()
         handlers = list(self.handlers.items())
         self.handlers.clear()
         dispatch_tasks = list(self.dispatch_tasks.values())
@@ -166,14 +172,17 @@ class RelayManager:
             self.tasks.append(asyncio.create_task(self._maintain(node)))
 
     async def _maintain(self, node: RelayNode) -> None:
+        backoff = self.config.relay_reconnect_delay
         while True:
-            if node.name in self.connections and not self.connections[node.name].closed:
-                await asyncio.sleep(2)
+            current = self.connections.get(node.name)
+            if current is not None and not current.closed:
+                assert current.closed_event is not None
+                await current.closed_event.wait()
                 continue
-            connected = False
-            for attempt in range(1, self.config.relay_reconnect_attempts + 1):
+            attempt = 1
+            while True:
                 try:
-                    print(f"[edge] relay reconnect attempt name={node.name} addr={node.host}:{node.port} attempt={attempt}/{self.config.relay_reconnect_attempts}")
+                    print(f"[edge] relay reconnect attempt name={node.name} addr={node.host}:{node.port} attempt={attempt} backoff={backoff:.1f}s")
                     reader, writer = await asyncio.wait_for(asyncio.open_connection(node.host, node.port), timeout=self.config.relay_open_timeout)
                     connection = RelayConnection(node=node, manager=self, reader=reader, writer=writer)
                     sock = writer.get_extra_info("socket")
@@ -182,16 +191,19 @@ class RelayManager:
                             sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
                     await connection.start()
                     self.connections[node.name] = connection
-                    connected = True
-                    await connection.pump_task
+                    backoff = self.config.relay_reconnect_delay
+                    assert connection.closed_event is not None
+                    await connection.closed_event.wait()
+                    print(f"[edge] relay supervisor noticed close name={node.name} addr={node.host}:{node.port}")
                     break
+                except asyncio.CancelledError:
+                    raise
                 except Exception as exc:
-                    print(f"[edge] relay connect failed name={node.name} addr={node.host}:{node.port} attempt={attempt}/{self.config.relay_reconnect_attempts} error={exc!r}")
-                    if attempt < self.config.relay_reconnect_attempts:
-                        await asyncio.sleep(self.config.relay_reconnect_delay)
-            if not connected:
-                print(f"[edge] relay reconnect exhausted name={node.name} addr={node.host}:{node.port} attempts={self.config.relay_reconnect_attempts}")
-                await asyncio.sleep(self.config.relay_reconnect_delay)
+                    print(f"[edge] relay connect failed name={node.name} addr={node.host}:{node.port} attempt={attempt} error={exc!r}")
+                    jitter = random.uniform(0, min(1.0, backoff * 0.2))
+                    await asyncio.sleep(backoff + jitter)
+                    backoff = min(self.config.relay_reconnect_max_delay, max(self.config.relay_reconnect_delay, backoff * 2))
+                    attempt += 1
 
     def on_closed(self, connection: RelayConnection) -> None:
         current = self.connections.get(connection.node.name)

+ 9 - 0
scheduler.py

@@ -5,6 +5,7 @@ import time
 from dataclasses import dataclass
 
 from .config import Config, RelayNode
+from .protocol import AUTH, PING, PONG, STATUS_OK, Frame, encode_json, read_frame, write_frame
 
 
 @dataclass
@@ -39,6 +40,14 @@ class Scheduler:
         started = time.perf_counter()
         try:
             reader, writer = await asyncio.wait_for(asyncio.open_connection(node.host, node.port), timeout=3)
+            await write_frame(writer, Frame(AUTH, 0, 0, 0, 0, encode_json({"token": node.token})))
+            auth = await asyncio.wait_for(read_frame(reader), timeout=3)
+            if auth.kind != AUTH or auth.packet_id != STATUS_OK:
+                raise ConnectionError(f"relay auth probe failed: {node.name}")
+            await write_frame(writer, Frame(PING, 0, 0, 1, 0, b""))
+            pong = await asyncio.wait_for(read_frame(reader), timeout=3)
+            if pong.kind != PONG:
+                raise ConnectionError(f"relay ping probe failed: {node.name}")
             writer.close()
             await writer.wait_closed()
             elapsed = (time.perf_counter() - started) * 1000