Fix federate loop (#4330)

* make activity channel infallible

* clippy

* federate: make `CancellableTask` loop itself
reduce-pool-size^2
phiresky 2024-01-03 19:30:06 +01:00 committed by GitHub
parent 53147596b4
commit 024ab7d530
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 61 deletions

View File

@ -23,6 +23,7 @@ static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(5);
#[cfg(not(debug_assertions))] #[cfg(not(debug_assertions))]
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60); static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
#[derive(Clone)]
pub struct Opts { pub struct Opts {
/// how many processes you are starting in total /// how many processes you are starting in total
pub process_count: i32, pub process_count: i32,
@ -36,7 +37,7 @@ async fn start_stop_federation_workers(
federation_config: FederationConfig<LemmyContext>, federation_config: FederationConfig<LemmyContext>,
cancel: CancellationToken, cancel: CancellationToken,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut workers = HashMap::<InstanceId, CancellableTask<_>>::new(); let mut workers = HashMap::<InstanceId, CancellableTask>::new();
let (stats_sender, stats_receiver) = unbounded_channel(); let (stats_sender, stats_receiver) = unbounded_channel();
let exit_print = tokio::spawn(receive_print_stats(pool.clone(), stats_receiver)); let exit_print = tokio::spawn(receive_print_stats(pool.clone(), stats_receiver));
@ -66,40 +67,30 @@ async fn start_stop_federation_workers(
let should_federate = allowed && !is_dead; let should_federate = allowed && !is_dead;
if should_federate { if should_federate {
if workers.contains_key(&instance.id) { if workers.contains_key(&instance.id) {
if workers // worker already running
.get(&instance.id) continue;
.map(util::CancellableTask::has_ended)
.unwrap_or(false)
{
// task must have errored out, remove and recreated it
let worker = workers
.remove(&instance.id)
.expect("just checked contains_key");
tracing::error!(
"worker for {} has stopped, recreating: {:?}",
instance.domain,
worker.cancel().await
);
} else {
continue;
}
} }
// create new worker // create new worker
let config = federation_config.clone();
let stats_sender = stats_sender.clone(); let stats_sender = stats_sender.clone();
let context = federation_config.to_request_data();
let pool = pool.clone(); let pool = pool.clone();
workers.insert( workers.insert(
instance.id, instance.id,
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, |stop| async move { CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
InstanceWorker::init_and_loop( let instance = instance.clone();
instance, let req_data = config.clone().to_request_data();
context, let stats_sender = stats_sender.clone();
&mut DbPool::Pool(&pool), let pool = pool.clone();
stop, async move {
stats_sender, InstanceWorker::init_and_loop(
) instance,
.await?; req_data,
Ok(()) &mut DbPool::Pool(&pool),
stop,
stats_sender,
)
.await
}
}), }),
); );
} else if !should_federate { } else if !should_federate {
@ -135,9 +126,12 @@ pub fn start_stop_federation_workers_cancellable(
opts: Opts, opts: Opts,
pool: ActualDbPool, pool: ActualDbPool,
config: FederationConfig<LemmyContext>, config: FederationConfig<LemmyContext>,
) -> CancellableTask<()> { ) -> CancellableTask {
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |c| { CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
start_stop_federation_workers(opts, pool, config, c) let opts = opts.clone();
let pool = pool.clone();
let config = config.clone();
async move { start_stop_federation_workers(opts, pool, config, stop).await }
}) })
} }

View File

@ -20,12 +20,7 @@ use moka::future::Cache;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use reqwest::Url; use reqwest::Url;
use serde_json::Value; use serde_json::Value;
use std::{ use std::{fmt::Debug, future::Future, pin::Pin, sync::Arc, time::Duration};
future::Future,
pin::Pin,
sync::{Arc, RwLock},
time::Duration,
};
use tokio::{task::JoinHandle, time::sleep}; use tokio::{task::JoinHandle, time::sleep};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
@ -49,41 +44,41 @@ pub(crate) static WORK_FINISHED_RECHECK_DELAY: Lazy<Duration> = Lazy::new(|| {
} }
}); });
pub struct CancellableTask<R: Send + 'static> { /// A task that will be run in an infinite loop, unless it is cancelled.
f: Pin<Box<dyn Future<Output = Result<R, anyhow::Error>> + Send + 'static>>, /// If the task exits without being cancelled, an error will be logged and the task will be restarted.
ended: Arc<RwLock<bool>>, pub struct CancellableTask {
f: Pin<Box<dyn Future<Output = Result<(), anyhow::Error>> + Send + 'static>>,
} }
impl<R: Send + 'static> CancellableTask<R> { impl CancellableTask {
/// spawn a task but with graceful shutdown /// spawn a task but with graceful shutdown
pub fn spawn<F>( pub fn spawn<F, R: Debug>(
timeout: Duration, timeout: Duration,
task: impl FnOnce(CancellationToken) -> F, task: impl Fn(CancellationToken) -> F + Send + 'static,
) -> CancellableTask<R> ) -> CancellableTask
where where
F: Future<Output = Result<R>> + Send + 'static, F: Future<Output = R> + Send + 'static,
{ {
let stop = CancellationToken::new(); let stop = CancellationToken::new();
let task = task(stop.clone()); let stop2 = stop.clone();
let ended = Arc::new(RwLock::new(false)); let task: JoinHandle<()> = tokio::spawn(async move {
let ended_write = ended.clone(); loop {
let task: JoinHandle<Result<R>> = tokio::spawn(async move { let res = task(stop2.clone()).await;
match task.await { if stop2.is_cancelled() {
Ok(o) => Ok(o), return;
Err(e) => { } else {
*ended_write.write().expect("poisoned") = true; tracing::warn!("task exited, restarting: {res:?}");
Err(e)
} }
} }
}); });
let abort = task.abort_handle(); let abort = task.abort_handle();
CancellableTask { CancellableTask {
ended,
f: Box::pin(async move { f: Box::pin(async move {
stop.cancel(); stop.cancel();
tokio::select! { tokio::select! {
r = task => { r = task => {
Ok(r.context("could not join")??) r.context("could not join")?;
Ok(())
}, },
_ = sleep(timeout) => { _ = sleep(timeout) => {
abort.abort(); abort.abort();
@ -96,12 +91,9 @@ impl<R: Send + 'static> CancellableTask<R> {
} }
/// cancel the cancel signal, wait for timeout for the task to stop gracefully, otherwise abort it /// cancel the cancel signal, wait for timeout for the task to stop gracefully, otherwise abort it
pub async fn cancel(self) -> Result<R, anyhow::Error> { pub async fn cancel(self) -> Result<(), anyhow::Error> {
self.f.await self.f.await
} }
pub fn has_ended(&self) -> bool {
*self.ended.read().expect("poisoned")
}
} }
/// assuming apub priv key and ids are immutable, then we don't need to have TTL /// assuming apub priv key and ids are immutable, then we don't need to have TTL

View File

@ -206,6 +206,7 @@ impl InstanceWorker {
.await .await
.context("failed figuring out inbox urls")?; .context("failed figuring out inbox urls")?;
if inbox_urls.is_empty() { if inbox_urls.is_empty() {
tracing::debug!("{}: {:?} no inboxes", self.instance.domain, activity.id);
self.state.last_successful_id = Some(activity.id); self.state.last_successful_id = Some(activity.id);
self.state.last_successful_published_time = Some(activity.published); self.state.last_successful_published_time = Some(activity.published);
return Ok(()); return Ok(());