feat(browser): add remote Playwright session support (Tier 2) (#325)

Add native Playwright WebSocket sessions alongside the existing curated
browser tools. Agents can now create remote sessions that expose a full
Playwright API over WebSocket, enabling advanced use cases like HttpOnly
cookie injection, storage state management, and route interception.

Key changes:
- Per-bot isolated browser processes (launchServer via Node child process)
- New session module with create/close/status/heartbeat endpoints
- New browser_remote_session agent tool (Go)
- Storage state export/import on existing browser contexts
- Bot ID plumbing through context creation for process isolation
- Inflight deduplication to prevent duplicate browser launches
- Session janitor for automatic expiry cleanup
This commit is contained in:
Lakr
2026-04-05 00:49:05 +09:00
committed by GitHub
parent 887b6235bb
commit daed345908
6 changed files with 624 additions and 9 deletions
+186
View File
@@ -1,8 +1,185 @@
import { chromium, firefox } from 'playwright'
import type { Browser } from 'playwright'
import { spawn, type ChildProcess } from 'child_process'
/** Browser engines the gateway can launch; `getBrowserType` maps each value to its Playwright browser type. */
export type BrowserCore = 'chromium' | 'firefox'
// --- Per-bot browser entry ---
// Uses launch() for the gateway's own Browser handle (Bun-compatible),
// and spawns a Node child process running launchServer() for the Tier 2
// remote WS endpoint that Python clients connect to.
export interface BotBrowserEntry {
// ID of the bot that owns this browser; entries are stored in botBrowsers under this key.
botId: string
// Browser engine the entry was launched with.
core: BrowserCore
// Gateway-side Playwright Browser handle created with launch().
browser: Browser
// Tier 2: remote WS endpoint for native Playwright clients
wsEndpoint?: string
// Child process hosting the Tier 2 server; assigned together with wsEndpoint
// and reset to undefined when that process exits.
serverProcess?: ChildProcess
}
/** Live per-bot browser entries, keyed by bot ID. */
const botBrowsers = new Map<string, BotBrowserEntry>()
/** Browser launches currently in progress, keyed by bot ID, so concurrent callers share one launch. */
const inflightBrowserCreations = new Map<string, Promise<BotBrowserEntry>>()

/** Limit used when MAX_BOT_BROWSER_SERVERS is unset or not a number. */
const DEFAULT_MAX_BOT_BROWSERS = 20

/**
 * Parses the per-bot browser limit from its environment value.
 *
 * parseInt returns NaN for an empty or non-numeric string, and every
 * `size >= NaN` comparison is false, which would silently disable the limit.
 * Such values fall back to the default instead.
 *
 * @param raw Raw environment value, possibly undefined.
 * @returns The parsed integer, or DEFAULT_MAX_BOT_BROWSERS when `raw` is not numeric.
 */
function parseMaxBotBrowsers(raw: string | undefined): number {
  const parsed = Number.parseInt(raw ?? '', 10)
  return Number.isNaN(parsed) ? DEFAULT_MAX_BOT_BROWSERS : parsed
}

/** Maximum number of per-bot browsers that may exist at once. */
const MAX_BOT_BROWSERS = parseMaxBotBrowsers(process.env.MAX_BOT_BROWSER_SERVERS)
/**
 * Picks the Playwright browser type for a core.
 * Only 'firefox' selects firefox; every other value resolves to chromium.
 */
function getBrowserType(core: BrowserCore) {
  if (core === 'firefox') {
    return firefox
  }
  return chromium
}
/**
 * Returns the bot's dedicated browser, launching one when none exists yet.
 *
 * Concurrent calls for the same bot share a single launch. A bot is bound to
 * one browser core for as long as its browser is alive.
 *
 * @param botId Bot that owns the browser.
 * @param core  Browser engine the caller needs.
 * @returns The registered entry for the bot.
 * @throws Error when the bot already has (or is launching) a browser of a
 *         different core, or when the browser limit is reached.
 */
export async function getOrCreateBotBrowser(botId: string, core: BrowserCore): Promise<BotBrowserEntry> {
  // The core cannot be switched while a browser is registered for the bot.
  const requireSameCore = (entry: BotBrowserEntry): BotBrowserEntry => {
    if (entry.core !== core) {
      throw new Error(`Bot ${botId} already has a ${entry.core} browser. Cannot switch to ${core} while active.`)
    }
    return entry
  }

  const existing = botBrowsers.get(botId)
  if (existing) {
    return requireSameCore(existing)
  }

  // Deduplicate concurrent creation for the same bot. The in-flight launch may
  // be for another core, so the result is checked before it is handed out.
  const inflight = inflightBrowserCreations.get(botId)
  if (inflight) {
    return requireSameCore(await inflight)
  }

  // Launches still in progress count toward the limit; otherwise a burst of
  // concurrent requests for different bots could exceed it.
  if (botBrowsers.size + inflightBrowserCreations.size >= MAX_BOT_BROWSERS) {
    throw new Error(`Browser limit reached (${MAX_BOT_BROWSERS}). Cannot create new browser for bot ${botId}.`)
  }

  const promise = (async () => {
    const browser = await getBrowserType(core).launch({ headless: true })
    const entry: BotBrowserEntry = { botId, core, browser }
    botBrowsers.set(botId, entry)

    browser.on('disconnected', () => {
      // If the registry no longer points at this entry it was closed on
      // purpose or replaced; leave whatever is registered now untouched.
      if (botBrowsers.get(botId) !== entry) return
      botBrowsers.delete(botId)
      // Once the entry is unregistered nothing else tracks its remote server,
      // so stop it here rather than leaking the child process.
      entry.serverProcess?.kill('SIGTERM')
      console.log(`Browser for bot ${botId} disconnected unexpectedly, cleaned up.`)
    })

    console.log(`Launched browser for bot ${botId} (${core})`)
    return entry
  })().finally(() => {
    inflightBrowserCreations.delete(botId)
  })

  inflightBrowserCreations.set(botId, promise)
  return promise
}
// Launch a Tier 2 remote server for a bot (Node child process running launchServer)
// Returns the WS endpoint that remote Python clients can connect to.
/**
 * Ensures the bot has a running remote Playwright server and returns its endpoint.
 *
 * @param botId Bot that owns the server.
 * @param core  Browser engine the caller needs.
 * @returns WebSocket endpoint of the bot's remote server.
 * @throws Error when the bot's browser uses a different core, when the browser
 *         limit is reached, or when the server fails to launch.
 */
export async function ensureBotRemoteServer(botId: string, core: BrowserCore): Promise<string> {
  // Ensure bot has a Tier 1 browser first (for gateway-side operations).
  // Going through getOrCreateBotBrowser on every call also rejects a core
  // mismatch, so an existing server of another engine is never returned.
  const entry = await getOrCreateBotBrowser(botId, core)
  if (entry.wsEndpoint && entry.serverProcess) {
    return entry.wsEndpoint
  }

  // Random path so the endpoint cannot be guessed from the port alone.
  const wsPath = `/${crypto.randomUUID()}`
  const wsEndpoint = await launchRemoteServer(botId, core, wsPath)
  console.log(`Remote WS server for bot ${botId} at ${wsEndpoint}`)
  return wsEndpoint
}
// Spawn a Node child process that runs launchServer() and returns the WS endpoint
/**
 * Starts the remote Playwright server for a bot in a separate Node process.
 *
 * The child prints `WS:<endpoint>` on stdout once the server is listening.
 * On success the child and endpoint are recorded on the bot's entry and are
 * cleared again when the child exits. On every failure path the child is
 * killed so no server process is left running untracked.
 *
 * @param botId  Bot whose entry receives the server process and endpoint.
 * @param core   Browser engine to launch.
 * @param wsPath Path component of the WebSocket endpoint.
 * @returns The WebSocket endpoint announced by the child.
 * @throws Error (as a rejection) on timeout, spawn failure, early child exit,
 *         or when the bot's browser entry no longer exists.
 */
function launchRemoteServer(botId: string, core: BrowserCore, wsPath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    // `core` is spliced into the script as an identifier; the static type is
    // not enforced at runtime, so check it before building source code.
    if (core !== 'chromium' && core !== 'firefox') {
      reject(new Error(`Unsupported browser core: ${String(core)}`))
      return
    }

    // JSON.stringify yields a correctly escaped string literal for wsPath.
    const script = `
const { ${core} } = require('playwright');
(async () => {
  const server = await ${core}.launchServer({ headless: true, wsPath: ${JSON.stringify(wsPath)} });
  process.stdout.write('WS:' + server.wsEndpoint() + '\\n');
  process.on('SIGTERM', () => { server.close().then(() => process.exit(0)); });
  process.on('SIGINT', () => { server.close().then(() => process.exit(0)); });
})().catch(e => { process.stderr.write(e.message); process.exit(1); });
`
    const child = spawn('node', ['-e', script], {
      cwd: process.cwd(),
      env: process.env,
      stdio: ['ignore', 'pipe', 'pipe'],
    })

    let settled = false
    let stdoutBuffer = ''
    let stderrTail = ''
    // Entry this child was attached to, if any; used to undo that on exit.
    let owner: BotBrowserEntry | undefined

    const timeout = setTimeout(() => {
      fail(new Error('Timed out launching remote browser server'))
    }, 30000)

    // Rejects once and makes sure the child does not outlive the failure.
    function fail(err: Error): void {
      if (settled) return
      settled = true
      clearTimeout(timeout)
      child.kill()
      reject(err)
    }

    child.stdout!.on('data', (data: Buffer) => {
      if (settled) return
      // A chunk may hold a partial line or several lines; only complete lines
      // are inspected and the unfinished remainder is kept for the next chunk.
      stdoutBuffer += data.toString()
      const lines = stdoutBuffer.split('\n')
      stdoutBuffer = lines.pop() ?? ''
      const announcement = lines.map(line => line.trim()).find(line => line.startsWith('WS:'))
      if (announcement === undefined) return

      const wsEndpoint = announcement.slice(3)
      // Directly assign to the correct bot entry by ID
      const botEntry = botBrowsers.get(botId)
      if (!botEntry) {
        // The bot's browser went away while the server was starting.
        fail(new Error(`Browser for bot ${botId} is no longer available; remote server discarded`))
        return
      }

      settled = true
      clearTimeout(timeout)

      if (botEntry.serverProcess && botEntry.wsEndpoint) {
        // Another launch finished first; keep that server and drop this one
        // instead of overwriting (and orphaning) the registered process.
        child.kill()
        resolve(botEntry.wsEndpoint)
        return
      }

      owner = botEntry
      botEntry.serverProcess = child
      botEntry.wsEndpoint = wsEndpoint
      resolve(wsEndpoint)
    })

    child.stderr!.on('data', (data: Buffer) => {
      // Output on stderr alone is not fatal (Node prints warnings there);
      // keep the tail so an early exit can be reported with context.
      stderrTail = (stderrTail + data.toString()).slice(-4096)
    })

    child.on('exit', (code, signal) => {
      // Only clear the entry if it still refers to this process.
      if (owner && owner.serverProcess === child) {
        owner.wsEndpoint = undefined
        owner.serverProcess = undefined
        console.log(`Remote server process for bot ${botId} exited`)
      }
      // No-op after a successful launch; otherwise fails fast instead of
      // waiting for the timeout.
      const detail = stderrTail.trim() || `process exited before announcing endpoint (code ${code}, signal ${signal})`
      fail(new Error(`Remote server error: ${detail}`))
    })

    child.on('error', (err) => {
      fail(err)
    })
  })
}
/** Looks up the registered browser entry for a bot; undefined when the bot has none. */
export function getBotBrowser(botId: string): BotBrowserEntry | undefined {
  const entry = botBrowsers.get(botId)
  return entry
}
/**
 * Shuts down a bot's browser and its remote server, if any.
 *
 * The entry is unregistered first, the remote server process is sent SIGTERM,
 * and the browser is closed on a best-effort basis. Does nothing when the bot
 * has no registered browser.
 */
export async function closeBotBrowser(botId: string): Promise<void> {
  const target = botBrowsers.get(botId)
  if (target === undefined) {
    return
  }

  botBrowsers.delete(botId)

  const remoteServer = target.serverProcess
  if (remoteServer) {
    remoteServer.kill('SIGTERM')
  }

  try {
    await target.browser.close()
  } catch {
    // The browser may already be closed; nothing more to do.
  }

  console.log(`Closed browser for bot ${botId}`)
}
/** Returns the live registry of bot browsers itself (not a copy), keyed by bot ID. */
export function getAllBotBrowsers(): Map<string, BotBrowserEntry> {
  const registry = botBrowsers
  return registry
}
// --- Shared fallback browser (backward compat for requests without bot_id) ---
export const browsers = new Map<BrowserCore, Browser>()
export const initBrowsers = async (): Promise<Map<BrowserCore, Browser>> => {
@@ -34,3 +211,12 @@ export const getAvailableCores = (): BrowserCore[] => {
const raw = process.env.BROWSER_CORES ?? 'chromium'
return raw.split(',').map(s => s.trim()) as BrowserCore[]
}
// --- Shutdown all ---
/**
 * Closes every registered bot browser, one after another.
 * The IDs are snapshotted first because closing removes entries from the registry.
 */
export async function closeAllBotBrowsers(): Promise<void> {
  const pendingIds = Array.from(botBrowsers.keys())
  for (const botId of pendingIds) {
    await closeBotBrowser(botId)
  }
}
+5 -1
View File
@@ -2,10 +2,11 @@ import { Elysia } from 'elysia'
import { loadConfig } from '@memohai/config'
import { corsMiddleware } from './middlewares/cors'
import { errorMiddleware } from './middlewares/error'
import { initBrowsers, browsers } from './browser'
import { initBrowsers, browsers, closeAllBotBrowsers } from './browser'
import { contextModule } from './modules/context'
import { devicesModule } from './modules/devices'
import { coresModule } from './modules/cores'
import { sessionModule, closeAllSessions } from './modules/session'
const configuredPath = process.env.MEMOH_CONFIG_PATH?.trim() || process.env.CONFIG_PATH?.trim()
const configPath = configuredPath && configuredPath.length > 0 ? configuredPath : '../../config.toml'
@@ -23,8 +24,11 @@ const app = new Elysia()
}))
.use(coresModule)
.use(contextModule)
.use(sessionModule)
.use(devicesModule)
.onStop(async () => {
await closeAllSessions()
await closeAllBotBrowsers()
for (const browser of browsers.values()) {
await browser.close()
}
+53 -5
View File
@@ -2,7 +2,7 @@ import { Elysia } from 'elysia'
import { storage } from '../storage'
import { z } from 'zod'
import { BrowserContextConfigModel } from '../models'
import { getBrowser } from '../browser'
import { getBrowser, getOrCreateBotBrowser } from '../browser'
import { actionModule } from './action'
export const contextModule = new Elysia({ prefix: '/context' })
@@ -22,10 +22,25 @@ export const contextModule = new Elysia({ prefix: '/context' })
})
.post(
'/',
async ({ body }) => {
const { name, config, id } = body
async ({ body, set }) => {
const { name, config, id, bot_id } = body
const core = config.core ?? 'chromium'
const browser = getBrowser(core)
// Reject duplicate context IDs to prevent orphaning live contexts
if (storage.has(id)) {
set.status = 409
return { error: `context with id "${id}" already exists` }
}
// Use per-bot isolated browser process if bot_id provided, otherwise shared fallback
let browser
if (bot_id) {
const botEntry = await getOrCreateBotBrowser(bot_id, core)
browser = botEntry.browser
} else {
browser = getBrowser(core)
}
const context = await browser.newContext({
viewport: config.viewport,
userAgent: config.userAgent,
@@ -39,7 +54,7 @@ export const contextModule = new Elysia({ prefix: '/context' })
ignoreHTTPSErrors: config.ignoreHTTPSErrors,
proxy: config.proxy,
})
storage.set(id, { id, name, core, context, config })
storage.set(id, { id, name, botId: bot_id, core, context, config })
return { id, name, core, config }
},
{
@@ -47,6 +62,7 @@ export const contextModule = new Elysia({ prefix: '/context' })
name: z.string().default(''),
config: BrowserContextConfigModel.default({}),
id: z.string().default(crypto.randomUUID()),
bot_id: z.string().optional(),
}),
},
)
@@ -58,3 +74,35 @@ export const contextModule = new Elysia({ prefix: '/context' })
}
return { success: true }
})
// Export storage state (cookies + localStorage) from a context
.get('/:id/storage-state', async ({ params, set }) => {
const entry = storage.get(params.id)
if (!entry) {
set.status = 404
return { error: 'context not found' }
}
const state = await entry.context.storageState()
return state
})
// Import cookies into an existing context
.post(
'/:id/storage-state',
async ({ params, body, set }) => {
const entry = storage.get(params.id)
if (!entry) {
set.status = 404
return { error: 'context not found' }
}
if (body.cookies && Array.isArray(body.cookies)) {
await entry.context.addCookies(body.cookies)
}
return { success: true }
},
{
body: z.object({
cookies: z.array(z.any()).optional(),
}),
},
)
+243
View File
@@ -0,0 +1,243 @@
import { Elysia } from 'elysia'
import { z } from 'zod'
import { ensureBotRemoteServer } from '../browser'
import type { BrowserCore } from '../browser'
// --- Session types ---
// Lifecycle metadata the gateway keeps for one remote Playwright session.
export interface RemotePlaywrightSession {
// Session identifier, generated with crypto.randomUUID() at creation.
id: string
// Bot the session was created for.
botId: string
// Browser engine requested for the session.
core: BrowserCore
// WebSocket endpoint of the bot's remote Playwright server.
wsEndpoint: string
// Secret returned by the create route; required by the status, close and heartbeat routes.
sessionToken: string
// Value of getPlaywrightVersion() at creation ('unknown' when it cannot be read).
playwrightVersion: string
// Context configuration supplied by the caller; stored and echoed back as-is.
contextConfig?: Record<string, unknown>
// Creation time.
createdAt: Date
// Time after which the session is no longer valid; moved by the heartbeat route.
expiresAt: Date
// Time of creation or of the most recent heartbeat.
lastSeenAt: Date
// 'active' while usable; set to 'expired' by the janitor and to 'closed' by the close route.
status: 'active' | 'expired' | 'closed'
}
// --- Session storage ---
/** Lifetime given to a session when the caller does not request one: 30 minutes. */
const SESSION_DEFAULT_TTL_MS = 30 * 60_000
/** Longest lifetime a session may have: 2 hours. */
const SESSION_MAX_TTL_MS = 2 * 3_600_000
/** Known sessions, keyed by session ID. */
const sessions: Map<string, RemotePlaywrightSession> = new Map()
/** Per-bot in-flight creation promises to prevent duplicate launches. */
const inflightCreations: Map<string, Promise<string>> = new Map()
/**
 * Reads the installed Playwright version from the package's package.json.
 * @returns The version string, or 'unknown' when the package metadata cannot be loaded.
 */
function getPlaywrightVersion(): string {
  let version = 'unknown'
  try {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const manifest = require('playwright/package.json') as { version: string }
    version = manifest.version
  } catch {
    // Package metadata is unavailable; keep the placeholder.
  }
  return version
}
/**
 * Creates a session token from 32 random bytes.
 * @returns 64 lowercase hexadecimal characters.
 */
function generateToken(): string {
  const raw = new Uint8Array(32)
  crypto.getRandomValues(raw)
  let hex = ''
  for (const byte of raw) {
    // Two hex digits per byte, zero-padded.
    hex += byte.toString(16).padStart(2, '0')
  }
  return hex
}
// --- Janitor ---
/** How often expired sessions are swept. */
const JANITOR_INTERVAL_MS = 60_000
/** Handle of the running sweep timer; null while the janitor is stopped. */
let janitorHandle: ReturnType<typeof setInterval> | null = null

/**
 * Starts the periodic sweep that drops expired sessions.
 * Calling it while a sweep timer already exists does nothing.
 */
function startJanitor() {
  if (janitorHandle) return

  const sweep = (): void => {
    const now = new Date()
    sessions.forEach((session, id) => {
      const isExpired = session.status === 'active' && now > session.expiresAt
      if (!isExpired) return
      session.status = 'expired'
      sessions.delete(id)
      console.log(`Session ${id} expired (bot: ${session.botId})`)
    })
  }

  janitorHandle = setInterval(sweep, JANITOR_INTERVAL_MS)
}

// The janitor runs from module load.
startJanitor()
// --- Helper to validate session token ---
/**
 * Resolves a session from its ID and token.
 *
 * @returns The session when it exists, is active, the token matches and it has
 *          not passed its expiry; otherwise null.
 */
// NOTE(review): the token is compared with a plain string comparison, which is
// not constant-time — confirm whether that matters for this deployment.
function validateSessionToken(sessionId: string, token: string): RemotePlaywrightSession | null {
  const candidate = sessions.get(sessionId)
  if (candidate === undefined) {
    return null
  }
  const isActive = candidate.status === 'active'
  const tokenMatches = candidate.sessionToken === token
  const isExpired = new Date() > candidate.expiresAt
  return isActive && tokenMatches && !isExpired ? candidate : null
}
// Deduplicated remote server creation
/**
 * Returns the bot's remote server endpoint, sharing one launch between
 * concurrent callers for the same bot.
 */
// NOTE(review): the in-flight promise is keyed by bot ID only, so a concurrent
// request for a different core receives the launch already in progress.
async function getOrCreateRemoteServer(botId: string, core: BrowserCore): Promise<string> {
  let launch = inflightCreations.get(botId)
  if (!launch) {
    launch = ensureBotRemoteServer(botId, core).finally(() => {
      // Later callers go through ensureBotRemoteServer again once this settles.
      inflightCreations.delete(botId)
    })
    inflightCreations.set(botId, launch)
  }
  return launch
}
// --- Elysia module ---
//
// Remote sessions give the client a WS endpoint to a dedicated per-bot
// Playwright server. The client gets full native Playwright API access —
// they create their own contexts, pages, cookies, etc. The gateway only
// tracks session lifecycle metadata (expiry, auth token).
//
// Routes (all under /session):
//   POST   /               create a session; responds 201 with endpoint and token
//   GET    /:id            session metadata
//   DELETE /:id            close the session
//   POST   /:id/heartbeat  extend the session lifetime
// The last three take the session token as the `token` query parameter and
// respond 404 when the session is unknown, not active, expired, or the token
// does not match.
export const sessionModule = new Elysia({ prefix: '/session' })
  // Create a remote Playwright session
  .post(
    '/',
    async ({ body, set }) => {
      const { bot_id, core, ttl_ms, context_config } = body
      const sessionCore = core ?? 'chromium'

      // Launch or reuse the bot's remote Playwright server (Node child process)
      const wsEndpoint = await getOrCreateRemoteServer(bot_id, sessionCore)

      const sessionId = crypto.randomUUID()
      const sessionToken = generateToken()
      // Requested lifetimes are capped at the maximum.
      const ttl = Math.min(ttl_ms ?? SESSION_DEFAULT_TTL_MS, SESSION_MAX_TTL_MS)
      const now = new Date()

      const session: RemotePlaywrightSession = {
        id: sessionId,
        botId: bot_id,
        core: sessionCore,
        wsEndpoint,
        sessionToken,
        playwrightVersion: getPlaywrightVersion(),
        contextConfig: context_config,
        createdAt: now,
        expiresAt: new Date(now.getTime() + ttl),
        lastSeenAt: now,
        status: 'active',
      }
      sessions.set(sessionId, session)

      set.status = 201
      console.log(`Created remote session ${sessionId} for bot ${bot_id} (core: ${sessionCore}, expires: ${session.expiresAt.toISOString()})`)

      return {
        id: sessionId,
        ws_endpoint: wsEndpoint,
        session_token: sessionToken,
        playwright_version: session.playwrightVersion,
        core: sessionCore,
        context_config: context_config ?? {},
        expires_at: session.expiresAt.toISOString(),
      }
    },
    {
      body: z.object({
        bot_id: z.string(),
        core: z.enum(['chromium', 'firefox']).optional(),
        // A zero or negative TTL would create a session that is already expired.
        ttl_ms: z.number().positive().optional(),
        context_config: z.record(z.string(), z.any()).optional(),
      }),
    },
  )
  // Get session metadata
  .get(
    '/:id',
    ({ params, query, set }) => {
      const session = validateSessionToken(params.id, query.token ?? '')
      if (!session) {
        set.status = 404
        return { error: 'session not found or invalid token' }
      }
      return {
        id: session.id,
        bot_id: session.botId,
        core: session.core,
        ws_endpoint: session.wsEndpoint,
        status: session.status,
        playwright_version: session.playwrightVersion,
        context_config: session.contextConfig ?? {},
        created_at: session.createdAt.toISOString(),
        expires_at: session.expiresAt.toISOString(),
        last_seen_at: session.lastSeenAt.toISOString(),
      }
    },
    {
      query: z.object({ token: z.string().optional() }),
    },
  )
  // Close session
  .delete(
    '/:id',
    ({ params, query, set }) => {
      const session = validateSessionToken(params.id, query.token ?? '')
      if (!session) {
        set.status = 404
        return { error: 'session not found or invalid token' }
      }
      session.status = 'closed'
      sessions.delete(session.id)
      console.log(`Closed remote session ${session.id} (bot: ${session.botId})`)
      return { success: true }
    },
    {
      query: z.object({ token: z.string().optional() }),
    },
  )
  // Heartbeat — extend session lifetime
  .post(
    '/:id/heartbeat',
    ({ params, query, set }) => {
      const session = validateSessionToken(params.id, query.token ?? '')
      if (!session) {
        set.status = 404
        return { error: 'session not found or invalid token' }
      }
      const now = new Date()
      // A heartbeat grants the default TTL from now, but never past the
      // maximum lifetime counted from creation.
      const hardDeadline = session.createdAt.getTime() + SESSION_MAX_TTL_MS
      const proposedExpiry = Math.min(now.getTime() + SESSION_DEFAULT_TTL_MS, hardDeadline)
      // Only ever move the expiry forward: a session created with a TTL longer
      // than the default must not be shortened by sending a heartbeat.
      if (proposedExpiry > session.expiresAt.getTime()) {
        session.expiresAt = new Date(proposedExpiry)
      }
      session.lastSeenAt = now
      return {
        expires_at: session.expiresAt.toISOString(),
        remaining_ms: session.expiresAt.getTime() - now.getTime(),
      }
    },
    {
      query: z.object({ token: z.string().optional() }),
    },
  )
// --- Exports for shutdown ---
/** Returns the session registry itself (not a copy), keyed by session ID. */
export function getActiveSessions(): Map<string, RemotePlaywrightSession> {
  const registry = sessions
  return registry
}
/**
 * Forgets every session and stops the janitor timer.
 * Only gateway-side bookkeeping is touched here.
 */
export async function closeAllSessions(): Promise<void> {
  sessions.clear()
  if (!janitorHandle) {
    return
  }
  clearInterval(janitorHandle)
  janitorHandle = null
}
+1
View File
@@ -5,6 +5,7 @@ import type { BrowserCore } from '../browser'
export interface GatewayBrowserContext {
id: string
name: string
botId?: string
core: BrowserCore
context: BrowserContext
config: BrowserContextConfig
+136 -3
View File
@@ -103,6 +103,23 @@ func (p *BrowserProvider) Tools(ctx context.Context, session SessionContext) ([]
return p.execObserve(ctx.Context, sess, inputAsMap(input))
},
},
{
Name: "browser_remote_session",
Description: "Manage a remote native Playwright session for full browser automation. Use 'create' to get a WebSocket endpoint that a Python Playwright client can connect to with full API access (including HttpOnly cookies, storage state, route interception, etc). Use 'close' to terminate a session. Use 'status' to check a session.",
Parameters: map[string]any{
"type": "object",
"properties": map[string]any{
"action": map[string]any{"type": "string", "enum": []string{"create", "close", "status"}, "description": "The session action to perform"},
"session_id": map[string]any{"type": "string", "description": "Session ID (required for close and status)"},
"session_token": map[string]any{"type": "string", "description": "Session token (required for close and status, returned by create)"},
"core": map[string]any{"type": "string", "enum": []string{"chromium", "firefox"}, "description": "Browser core to use (for create, default: chromium)"},
},
"required": []string{"action"},
},
Execute: func(ctx *sdk.ToolExecContext, input any) (any, error) {
return p.execRemoteSession(ctx.Context, sess, inputAsMap(input))
},
},
}, nil
}
@@ -119,7 +136,7 @@ func (p *BrowserProvider) resolveContext(ctx context.Context, botID string) (str
if err != nil {
return "", browsercontexts.BrowserContext{}, fmt.Errorf("failed to load browser context config: %s", err.Error())
}
if err := p.ensureContext(ctx, browserCtxID, bcConfig); err != nil {
if err := p.ensureContext(ctx, botID, browserCtxID, bcConfig); err != nil {
return "", browsercontexts.BrowserContext{}, fmt.Errorf("failed to ensure browser context: %s", err.Error())
}
return browserCtxID, bcConfig, nil
@@ -185,7 +202,7 @@ func (p *BrowserProvider) execObserve(ctx context.Context, session SessionContex
return p.doGatewayAction(ctx, botID, contextID, payload)
}
func (p *BrowserProvider) ensureContext(ctx context.Context, contextID string, bc browsercontexts.BrowserContext) error {
func (p *BrowserProvider) ensureContext(ctx context.Context, botID, contextID string, bc browsercontexts.BrowserContext) error {
existsURL := fmt.Sprintf("%s/context/%s/exists", p.gatewayBaseURL, contextID)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, existsURL, nil)
if err != nil {
@@ -206,7 +223,7 @@ func (p *BrowserProvider) ensureContext(ctx context.Context, contextID string, b
if existsResp.Exists {
return nil
}
createPayload, _ := json.Marshal(map[string]any{"id": contextID, "name": bc.Name, "config": bc.Config})
createPayload, _ := json.Marshal(map[string]any{"id": contextID, "name": bc.Name, "config": bc.Config, "bot_id": botID})
createURL := fmt.Sprintf("%s/context", p.gatewayBaseURL)
createReq, err := http.NewRequestWithContext(ctx, http.MethodPost, createURL, bytes.NewReader(createPayload))
if err != nil {
@@ -300,3 +317,119 @@ func (p *BrowserProvider) buildScreenshotResult(ctx context.Context, botID, base
},
}
}
// execRemoteSession runs the browser_remote_session tool: it resolves the
// bot's browser context (the same access gate the other browser tools use)
// and dispatches on the "action" argument ("create", "close" or "status").
//
// It returns an error when the session has no bot ID, the context cannot be
// resolved, the action is unknown, or the session_id/session_token arguments
// for "close" and "status" are missing or malformed.
func (p *BrowserProvider) execRemoteSession(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, errors.New("bot_id is required")
	}

	// Same access gate as browser_action/browser_observe
	_, bcConfig, err := p.resolveContext(ctx, botID)
	if err != nil {
		return nil, err
	}

	action := StringArg(args, "action")
	switch action {
	case "create":
		return p.createRemoteSession(ctx, botID, bcConfig, args)
	case "close":
		sessionID, sessionToken, err := remoteSessionCredentials(args, action)
		if err != nil {
			return nil, err
		}
		return p.closeRemoteSession(ctx, sessionID, sessionToken)
	case "status":
		sessionID, sessionToken, err := remoteSessionCredentials(args, action)
		if err != nil {
			return nil, err
		}
		return p.getRemoteSessionStatus(ctx, sessionID, sessionToken)
	default:
		return nil, fmt.Errorf("unknown session action: %s", action)
	}
}

// remoteSessionCredentials extracts and validates the session_id and
// session_token tool arguments for the given action.
//
// Both values are later placed into a gateway URL, and they come from tool
// input, so anything outside a conservative character set is rejected here.
// Gateway-issued IDs are UUIDs and tokens are hex strings, which both pass.
func remoteSessionCredentials(args map[string]any, action string) (string, string, error) {
	sessionID := StringArg(args, "session_id")
	sessionToken := StringArg(args, "session_token")
	if sessionID == "" {
		return "", "", fmt.Errorf("session_id is required for %s", action)
	}
	if sessionToken == "" {
		return "", "", fmt.Errorf("session_token is required for %s", action)
	}
	if !isURLSafeSessionValue(sessionID) {
		return "", "", errors.New("session_id contains invalid characters")
	}
	if !isURLSafeSessionValue(sessionToken) {
		return "", "", errors.New("session_token contains invalid characters")
	}
	return sessionID, sessionToken, nil
}

// isURLSafeSessionValue reports whether s consists solely of ASCII letters,
// digits, '-' and '_', i.e. cannot change the structure of a URL path or query.
func isURLSafeSessionValue(s string) bool {
	for _, r := range s {
		switch {
		case r >= 'a' && r <= 'z':
		case r >= 'A' && r <= 'Z':
		case r >= '0' && r <= '9':
		case r == '-' || r == '_':
		default:
			return false
		}
	}
	return true
}
// createRemoteSession asks the gateway to create a remote Playwright session
// for the bot and returns the decoded JSON response.
//
// The browser core comes from the "core" argument and defaults to "chromium";
// the bot's browser context config is forwarded as context_config. An error
// is returned when the request cannot be built or sent, the response cannot
// be read or decoded, or the gateway answers with HTTP status 400 or above.
func (p *BrowserProvider) createRemoteSession(ctx context.Context, botID string, bcConfig browsercontexts.BrowserContext, args map[string]any) (any, error) {
	core := StringArg(args, "core")
	if core == "" {
		core = "chromium"
	}

	payload, err := json.Marshal(map[string]any{
		"bot_id":         botID,
		"core":           core,
		"context_config": bcConfig.Config,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to encode create session request: %w", err)
	}

	reqURL := fmt.Sprintf("%s/session", p.gatewayBaseURL)
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, reqURL, bytes.NewReader(payload))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := p.httpClient.Do(req) //nolint:gosec
	if err != nil {
		return nil, fmt.Errorf("failed to create remote session: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	// A failed read would otherwise surface later as a misleading decode error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read create session response: %w", err)
	}
	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("create session failed (HTTP %d): %s", resp.StatusCode, string(body))
	}

	var result map[string]any
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("invalid session response: %w", err)
	}
	return result, nil
}
// closeRemoteSession asks the gateway to close a remote session and returns
// the decoded JSON response.
//
// An error is returned when the request cannot be built or sent, the response
// cannot be read or decoded, or the gateway answers with HTTP status 400 or
// above.
//
// NOTE(review): sessionID and sessionToken are inserted into the URL without
// escaping, so they must already be URL-safe when they reach this method.
func (p *BrowserProvider) closeRemoteSession(ctx context.Context, sessionID, sessionToken string) (any, error) {
	reqURL := fmt.Sprintf("%s/session/%s?token=%s", p.gatewayBaseURL, sessionID, sessionToken)
	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, reqURL, nil)
	if err != nil {
		return nil, err
	}

	resp, err := p.httpClient.Do(req) //nolint:gosec
	if err != nil {
		return nil, fmt.Errorf("failed to close remote session: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	// A failed read would otherwise surface later as a misleading decode error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read close session response: %w", err)
	}
	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("close session failed (HTTP %d): %s", resp.StatusCode, string(body))
	}

	var result map[string]any
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("invalid session response: %w", err)
	}
	return result, nil
}
// getRemoteSessionStatus fetches a remote session's metadata from the gateway
// and returns the decoded JSON response.
//
// An error is returned when the request cannot be built or sent, the response
// cannot be read or decoded, or the gateway answers with HTTP status 400 or
// above.
//
// NOTE(review): sessionID and sessionToken are inserted into the URL without
// escaping, so they must already be URL-safe when they reach this method.
func (p *BrowserProvider) getRemoteSessionStatus(ctx context.Context, sessionID, sessionToken string) (any, error) {
	reqURL := fmt.Sprintf("%s/session/%s?token=%s", p.gatewayBaseURL, sessionID, sessionToken)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
	if err != nil {
		return nil, err
	}

	resp, err := p.httpClient.Do(req) //nolint:gosec
	if err != nil {
		return nil, fmt.Errorf("failed to get remote session status: %w", err)
	}
	defer func() { _ = resp.Body.Close() }()

	// A failed read would otherwise surface later as a misleading decode error.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read session status response: %w", err)
	}
	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("get session status failed (HTTP %d): %s", resp.StatusCode, string(body))
	}

	var result map[string]any
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("invalid session response: %w", err)
	}
	return result, nil
}