relay_client.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. from __future__ import annotations
  2. import asyncio
  3. import contextlib
  4. import random
  5. import socket
  6. from dataclasses import dataclass
  7. import time
  8. from typing import Awaitable, Callable, Dict
  9. from .config import Config, RelayNode
  10. from .protocol import AUTH, PING, PONG, STATUS_OK, TCP_CLOSE, Frame, encode_json, read_frame, write_frame
  11. from .scheduler import Scheduler
# Async callback bound to one (session_id, stream_id) pair; it is awaited with
# the owning RelayConnection and the Frame that was received for that pair.
FrameHandler = Callable[["RelayConnection", Frame], Awaitable[None]]
  13. @dataclass
  14. class RelayConnection:
  15. node: RelayNode
  16. manager: "RelayManager"
  17. reader: asyncio.StreamReader
  18. writer: asyncio.StreamWriter
  19. closed: bool = False
  20. handlers: Dict[tuple[int, int], FrameHandler] = None
  21. dispatch_tasks: Dict[tuple[int, int], asyncio.Task] = None
  22. pump_task: asyncio.Task | None = None
  23. keepalive_task: asyncio.Task | None = None
  24. last_pong_at: float = 0.0
  25. send_lock: asyncio.Lock | None = None
  26. closed_event: asyncio.Event | None = None
  27. def __post_init__(self) -> None:
  28. if self.handlers is None:
  29. self.handlers = {}
  30. if self.dispatch_tasks is None:
  31. self.dispatch_tasks = {}
  32. if self.send_lock is None:
  33. self.send_lock = asyncio.Lock()
  34. if self.closed_event is None:
  35. self.closed_event = asyncio.Event()
  36. async def start(self) -> None:
  37. print(f"[edge] connecting relay name={self.node.name} addr={self.node.host}:{self.node.port}")
  38. await write_frame(self.writer, Frame(AUTH, 0, 0, 0, 0, encode_json({"token": self.node.token})))
  39. frame = await read_frame(self.reader)
  40. if frame.kind != AUTH or frame.packet_id != STATUS_OK:
  41. raise ConnectionError(f"relay auth failed: {self.node.name}")
  42. print(f"[edge] relay connected name={self.node.name} addr={self.node.host}:{self.node.port}")
  43. self.last_pong_at = time.monotonic()
  44. self.keepalive_task = asyncio.create_task(self._keepalive())
  45. self.pump_task = asyncio.create_task(self._pump())
  46. async def _keepalive(self) -> None:
  47. try:
  48. while not self.closed:
  49. await asyncio.sleep(self.manager.config.relay_ping_interval)
  50. if self.closed:
  51. break
  52. if self.last_pong_at and time.monotonic() - self.last_pong_at > (self.manager.config.relay_ping_interval + self.manager.config.relay_ping_timeout):
  53. print(f"[edge] relay health timeout name={self.node.name} addr={self.node.host}:{self.node.port} timeout={self.manager.config.relay_ping_timeout}")
  54. await self.close()
  55. break
  56. await self.send(Frame(PING, 0, 0, 0, 0, b""))
  57. except asyncio.CancelledError:
  58. pass
  59. except Exception:
  60. await self.close()
  61. async def _pump(self) -> None:
  62. try:
  63. while True:
  64. frame = await read_frame(self.reader)
  65. if frame.kind == PONG:
  66. self.last_pong_at = time.monotonic()
  67. continue
  68. handler = self.handlers.get((frame.session_id, frame.stream_id))
  69. if handler:
  70. self._dispatch_frame(frame, handler)
  71. else:
  72. print(f"[edge] relay frame dropped name={self.node.name} session={frame.session_id} stream={frame.stream_id} kind={frame.kind}")
  73. except asyncio.IncompleteReadError:
  74. print(f"[edge] relay disconnected name={self.node.name} eof=true")
  75. except Exception as exc:
  76. print(f"[edge] relay pump error name={self.node.name} error={exc!r}")
  77. finally:
  78. await self.close()
  79. def _dispatch_frame(self, frame: Frame, handler: FrameHandler) -> None:
  80. key = (frame.session_id, frame.stream_id)
  81. previous = self.dispatch_tasks.get(key)
  82. task = asyncio.create_task(self._run_handler(key, frame, handler, previous))
  83. self.dispatch_tasks[key] = task
  84. async def _run_handler(self, key: tuple[int, int], frame: Frame, handler: FrameHandler, previous: asyncio.Task | None) -> None:
  85. try:
  86. if previous is not None:
  87. with contextlib.suppress(Exception):
  88. await previous
  89. if self.closed:
  90. return
  91. await handler(self, frame)
  92. except asyncio.CancelledError:
  93. pass
  94. except Exception:
  95. if not self.closed:
  96. await self.close()
  97. finally:
  98. if self.dispatch_tasks.get(key) is asyncio.current_task():
  99. self.dispatch_tasks.pop(key, None)
  100. async def send(self, frame: Frame) -> None:
  101. if self.closed:
  102. raise ConnectionError(f"relay closed: {self.node.name}")
  103. assert self.send_lock is not None
  104. async with self.send_lock:
  105. if self.closed:
  106. raise ConnectionError(f"relay closed: {self.node.name}")
  107. await write_frame(self.writer, frame)
  108. def bind(self, session_id: int, stream_id: int, handler: FrameHandler) -> None:
  109. self.handlers[(session_id, stream_id)] = handler
  110. def unbind(self, session_id: int, stream_id: int) -> None:
  111. self.handlers.pop((session_id, stream_id), None)
  112. task = self.dispatch_tasks.pop((session_id, stream_id), None)
  113. if task is not None:
  114. task.cancel()
  115. async def close(self) -> None:
  116. if self.closed:
  117. return
  118. self.closed = True
  119. assert self.closed_event is not None
  120. self.closed_event.set()
  121. handlers = list(self.handlers.items())
  122. self.handlers.clear()
  123. dispatch_tasks = list(self.dispatch_tasks.values())
  124. self.dispatch_tasks.clear()
  125. self.manager.on_closed(self)
  126. for (session_id, stream_id), handler in handlers:
  127. with contextlib.suppress(Exception):
  128. await handler(self, Frame(TCP_CLOSE, session_id, stream_id, 0, 0, b""))
  129. for task in dispatch_tasks:
  130. task.cancel()
  131. for task in dispatch_tasks:
  132. with contextlib.suppress(Exception):
  133. await task
  134. if self.pump_task and self.pump_task is not asyncio.current_task():
  135. self.pump_task.cancel()
  136. with contextlib.suppress(Exception):
  137. await self.pump_task
  138. if self.keepalive_task and self.keepalive_task is not asyncio.current_task():
  139. self.keepalive_task.cancel()
  140. with contextlib.suppress(Exception):
  141. await self.keepalive_task
  142. self.writer.close()
  143. with contextlib.suppress(Exception):
  144. await self.writer.wait_closed()
  145. class RelayManager:
  146. def __init__(self, config: Config) -> None:
  147. self.config = config
  148. self.scheduler = Scheduler(config)
  149. self.connections: Dict[str, RelayConnection] = {}
  150. self.tasks: list[asyncio.Task] = []
  151. async def start(self) -> None:
  152. await self.scheduler.start()
  153. for node in self.config.relays:
  154. self.tasks.append(asyncio.create_task(self._maintain(node)))
  155. async def _maintain(self, node: RelayNode) -> None:
  156. backoff = self.config.relay_reconnect_delay
  157. while True:
  158. current = self.connections.get(node.name)
  159. if current is not None and not current.closed:
  160. assert current.closed_event is not None
  161. await current.closed_event.wait()
  162. continue
  163. attempt = 1
  164. while True:
  165. try:
  166. print(f"[edge] relay reconnect attempt name={node.name} addr={node.host}:{node.port} attempt={attempt} backoff={backoff:.1f}s")
  167. reader, writer = await asyncio.wait_for(asyncio.open_connection(node.host, node.port), timeout=self.config.relay_open_timeout)
  168. connection = RelayConnection(node=node, manager=self, reader=reader, writer=writer)
  169. sock = writer.get_extra_info("socket")
  170. if sock is not None and self.config.relay_tcp_nodelay:
  171. with contextlib.suppress(OSError):
  172. sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
  173. await connection.start()
  174. self.connections[node.name] = connection
  175. backoff = self.config.relay_reconnect_delay
  176. assert connection.closed_event is not None
  177. await connection.closed_event.wait()
  178. print(f"[edge] relay supervisor noticed close name={node.name} addr={node.host}:{node.port}")
  179. break
  180. except asyncio.CancelledError:
  181. raise
  182. except Exception as exc:
  183. print(f"[edge] relay connect failed name={node.name} addr={node.host}:{node.port} attempt={attempt} error={exc!r}")
  184. jitter = random.uniform(0, min(1.0, backoff * 0.2))
  185. await asyncio.sleep(backoff + jitter)
  186. backoff = min(self.config.relay_reconnect_max_delay, max(self.config.relay_reconnect_delay, backoff * 2))
  187. attempt += 1
  188. def on_closed(self, connection: RelayConnection) -> None:
  189. current = self.connections.get(connection.node.name)
  190. if current is connection:
  191. self.connections.pop(connection.node.name, None)
  192. def available(self) -> list[RelayConnection]:
  193. chosen = {node.name for node in self.scheduler.choose()}
  194. preferred = [self.connections[name] for name in chosen if name in self.connections and not self.connections[name].closed]
  195. if preferred:
  196. return preferred
  197. return [conn for conn in self.connections.values() if not conn.closed]
  198. def snapshot(self) -> list[dict[str, object]]:
  199. data = self.scheduler.snapshot()
  200. online = {name for name, conn in self.connections.items() if not conn.closed}
  201. for item in data:
  202. item["online"] = item["name"] in online
  203. return data