Skip to content

Commit

Permalink
fix: Add region to healthcheck; Better failure flow on Connect (#1220)
Browse files Browse the repository at this point in the history
  • Loading branch information
filipecabaco authored Nov 13, 2024
1 parent 5dafe26 commit 9b1cfe1
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 20 deletions.
49 changes: 44 additions & 5 deletions lib/realtime/tenants.ex
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,36 @@ defmodule Realtime.Tenants do
{:error,
:tenant_not_found
| String.t()
| %{connected_cluster: pos_integer, db_connected: false, healthy: false}}
| {:ok, %{connected_cluster: non_neg_integer, db_connected: true, healthy: true}}
| %{
connected_cluster: pos_integer,
db_connected: false,
healthy: false,
region: String.t(),
node: String.t()
}}
| {:ok,
%{
connected_cluster: non_neg_integer,
db_connected: true,
healthy: true,
region: String.t(),
node: String.t()
}}
def health_check(external_id) when is_binary(external_id) do
region = Application.get_env(:realtime, :region)
node = Node.self() |> to_string()

with %Tenant{} = tenant <- Cache.get_tenant_by_external_id(external_id),
{:error, _} <- get_health_conn(tenant),
connected_cluster when connected_cluster > 0 <- UsersCounter.tenant_users(external_id) do
{:error, %{healthy: false, db_connected: false, connected_cluster: connected_cluster}}
{:error,
%{
healthy: false,
db_connected: false,
connected_cluster: connected_cluster,
region: region,
node: node
}}
else
nil ->
{:error, :tenant_not_found}
Expand All @@ -70,14 +93,30 @@ defmodule Realtime.Tenants do
connected_cluster = UsersCounter.tenant_users(external_id)
tenant = Cache.get_tenant_by_external_id(external_id)
Migrations.maybe_run_migrations(health_conn, tenant)
{:ok, %{healthy: true, db_connected: true, connected_cluster: connected_cluster}}

{:ok,
%{
healthy: true,
db_connected: true,
connected_cluster: connected_cluster,
region: region,
node: node
}}

connected_cluster when is_integer(connected_cluster) ->
tenant = Cache.get_tenant_by_external_id(external_id)
{:ok, db_conn} = Database.connect(tenant, "realtime_health_check", 1)
Migrations.maybe_run_migrations(db_conn, tenant)
Process.alive?(db_conn) && GenServer.stop(db_conn)
{:ok, %{healthy: true, db_connected: false, connected_cluster: connected_cluster}}

{:ok,
%{
healthy: true,
db_connected: false,
connected_cluster: connected_cluster,
region: region,
node: node
}}
end
end

Expand Down
15 changes: 10 additions & 5 deletions lib/realtime/tenants/connect.ex
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,17 @@ defmodule Realtime.Tenants.Connect do

def handle_continue(:run_migrations, state) do
%{tenant: tenant, db_conn_pid: db_conn_pid} = state
:ok = Migrations.maybe_run_migrations(db_conn_pid, tenant)
:ok = Migrations.create_partitions(db_conn_pid)
{:ok, broadcast_changes_pid} = start_replication(tenant)

{:noreply, %{state | broadcast_changes_pid: broadcast_changes_pid},
{:continue, :setup_connected_user_events}}
with :ok <- Migrations.maybe_run_migrations(db_conn_pid, tenant),
:ok <- Migrations.create_partitions(db_conn_pid),
{:ok, broadcast_changes_pid} <- start_replication(tenant) do
{:noreply, %{state | broadcast_changes_pid: broadcast_changes_pid},
{:continue, :setup_connected_user_events}}
else
error ->
log_error("MigrationsFailedToRun", error)
{:stop, :shutdown}
end
end

@impl true
Expand Down
4 changes: 2 additions & 2 deletions lib/realtime_web/templates/layout/live.html.heex
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@
</header>
<main class="px-4 py-24 sm:px-6 lg:px-8">
<div class="mx-auto max-w-6xl">
<p class="alert alert-info" role="alert"><%= live_flash(@flash, :info) %></p>
<p class="alert alert-danger" role="alert"><%= live_flash(@flash, :error) %></p>
<p class="alert alert-info" role="alert"><%= Phoenix.Flash.get(@flash, :info) %></p>
<p class="alert alert-danger" role="alert"><%= Phoenix.Flash.get(@flash, :error) %></p>

<%= @inner_content %>
</div>
Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
def project do
[
app: :realtime,
version: "2.33.44",
version: "2.33.45",
elixir: "~> 1.16.0",
elixirc_paths: elixirc_paths(Mix.env()),
start_permanent: Mix.env() == :prod,
Expand Down
8 changes: 4 additions & 4 deletions test/realtime/tenants/connect_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ defmodule Realtime.Tenants.ConnectTest do

test "on database disconnect, returns new connection", %{tenant: tenant} do
assert {:ok, old_conn} = Connect.lookup_or_start_connection(tenant.external_id)

:timer.sleep(500)
GenServer.stop(old_conn)
:timer.sleep(500)

Expand Down Expand Up @@ -75,7 +75,7 @@ defmodule Realtime.Tenants.ConnectTest do
tenant: %{external_id: tenant_id}
} do
{:ok, db_conn} =
Connect.lookup_or_start_connection(tenant_id, check_connected_user_interval: 50)
Connect.lookup_or_start_connection(tenant_id, check_connected_user_interval: 200)

Sandbox.allow(Repo, self(), db_conn)
# Not enough time has passed, connection still alive
Expand Down Expand Up @@ -184,8 +184,8 @@ defmodule Realtime.Tenants.ConnectTest do
with_mock Realtime.Tenants.Migrations, [],
maybe_run_migrations: fn _, _ -> raise("error") end do
assert {:ok, pid} = Connect.lookup_or_start_connection(tenant.external_id)
Process.alive?(pid)
Process.sleep(1000)
Process.sleep(200)
refute Process.alive?(pid)
assert_called(Realtime.Tenants.Migrations.maybe_run_migrations(:_, :_))
end
end
Expand Down
30 changes: 27 additions & 3 deletions test/realtime_web/controllers/tenant_controller_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ defmodule RealtimeWeb.TenantControllerTest do
describe "health check tenant" do
setup [:create_tenant]

setup do
Application.put_env(:realtime, :region, "us-east-1")
on_exit(fn -> Application.put_env(:realtime, :region, nil) end)
end

test "health check when tenant does not exist", %{conn: conn} do
with_mock ChannelsAuthorization, authorize: fn _, _, _ -> {:ok, %{}} end do
Routes.tenant_path(conn, :reload, "wrong_external_id")
Expand All @@ -216,7 +221,14 @@ defmodule RealtimeWeb.TenantControllerTest do
with_mock JwtVerification, verify: fn _token, _secret, _jwks -> {:ok, %{}} end do
conn = get(conn, Routes.tenant_path(conn, :health, ext_id))
data = json_response(conn, 200)["data"]
assert %{"healthy" => true, "db_connected" => false, "connected_cluster" => 0} = data

assert %{
"healthy" => true,
"db_connected" => false,
"connected_cluster" => 0,
"region" => "us-east-1",
"node" => "nonode@nohost"
} == data
end
end

Expand All @@ -232,7 +244,13 @@ defmodule RealtimeWeb.TenantControllerTest do
conn = get(conn, Routes.tenant_path(conn, :health, ext_id))
data = json_response(conn, 200)["data"]

assert %{"healthy" => false, "db_connected" => false, "connected_cluster" => 1} = data
assert %{
"healthy" => false,
"db_connected" => false,
"connected_cluster" => 1,
"region" => "us-east-1",
"node" => "nonode@nohost"
} == data
end
end

Expand All @@ -255,7 +273,13 @@ defmodule RealtimeWeb.TenantControllerTest do
conn = get(conn, Routes.tenant_path(conn, :health, ext_id))
data = json_response(conn, 200)["data"]

assert %{"healthy" => true, "db_connected" => true, "connected_cluster" => 1} = data
assert %{
"healthy" => true,
"db_connected" => true,
"connected_cluster" => 1,
"region" => "us-east-1",
"node" => "nonode@nohost"
} == data
end
end

Expand Down

0 comments on commit 9b1cfe1

Please sign in to comment.