Adding a scaled sort, to boost smaller communities. (#3907)

* Adding a scaled sort, to boost smaller communities. - Previously referred to as *best* . - Fixes #3622 * Fixing scheduled task update. * Converting hot_rank integers to floats. * Altering hot_rank psql function to default to zero after a week. * Setting scaled_rank to zero, where hot_rank is zero. * Adding image_upload table.
2023-09-06 13:43:27 -04:00 · 2023-09-06 13:43:27 -04:00 · 9785b20843
parent 4121fc4d56
commit 9785b20843
13 changed files with 279 additions and 31 deletions
--- a/crates/apub/src/activities/create_or_update/post.rs
+++ b/crates/apub/src/activities/create_or_update/post.rs
@ -150,7 +150,7 @@ impl ActivityHandler for CreateOrUpdatePage {
    PostLike::like(&mut context.pool(), &like_form).await?;

    // Calculate initial hot_rank for post
-    PostAggregates::update_hot_rank(&mut context.pool(), post.id).await?;
+    PostAggregates::update_ranks(&mut context.pool(), post.id).await?;

    Ok(())
  }
--- a/crates/db_schema/src/aggregates/post_aggregates.rs
+++ b/crates/db_schema/src/aggregates/post_aggregates.rs
@ -1,10 +1,14 @@
 use crate::{
  aggregates::structs::PostAggregates,
  newtypes::PostId,
-  schema::post_aggregates,
-  utils::{functions::hot_rank, get_conn, DbPool},
+  schema::{community_aggregates, post, post_aggregates},
+  utils::{
+    functions::{hot_rank, scaled_rank},
+    get_conn,
+    DbPool,
+  },
 };
-use diesel::{result::Error, ExpressionMethods, QueryDsl};
+use diesel::{result::Error, ExpressionMethods, JoinOnDsl, QueryDsl};
 use diesel_async::RunQueryDsl;

 impl PostAggregates {
@ -16,9 +20,19 @@ impl PostAggregates {
      .await
  }

-  pub async fn update_hot_rank(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
+  pub async fn update_ranks(pool: &mut DbPool<'_>, post_id: PostId) -> Result<Self, Error> {
    let conn = &mut get_conn(pool).await?;

+    // Diesel can't update based on a join, which is necessary for the scaled_rank
+    // https://github.com/diesel-rs/diesel/issues/1478
+    // Just select the users_active_month manually for now, since its a single post anyway
+    let users_active_month = community_aggregates::table
+      .select(community_aggregates::users_active_month)
+      .inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
+      .filter(post::id.eq(post_id))
+      .first::<i64>(conn)
+      .await?;
+
    diesel::update(post_aggregates::table)
      .filter(post_aggregates::post_id.eq(post_id))
      .set((
@ -27,6 +41,11 @@ impl PostAggregates {
          post_aggregates::score,
          post_aggregates::newest_comment_time_necro,
        )),
+        post_aggregates::scaled_rank.eq(scaled_rank(
+          post_aggregates::score,
+          post_aggregates::published,
+          users_active_month,
+        )),
      ))
      .get_result::<Self>(conn)
      .await
--- a/crates/db_schema/src/aggregates/structs.rs
+++ b/crates/db_schema/src/aggregates/structs.rs
@ -27,11 +27,11 @@ pub struct CommentAggregates {
  pub published: DateTime<Utc>,
  /// The total number of children in this comment branch.
  pub child_count: i32,
-  pub hot_rank: i32,
+  pub hot_rank: f64,
  pub controversy_rank: f64,
 }

-#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
+#[derive(PartialEq, Debug, Serialize, Deserialize, Clone)]
 #[cfg_attr(feature = "full", derive(Queryable, Associations, Identifiable, TS))]
 #[cfg_attr(feature = "full", diesel(table_name = community_aggregates))]
 #[cfg_attr(
@ -55,7 +55,7 @@ pub struct CommunityAggregates {
  pub users_active_month: i64,
  /// The number of users with any activity in the last year.
  pub users_active_half_year: i64,
-  pub hot_rank: i32,
+  pub hot_rank: f64,
 }

 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone, Default)]
@ -95,11 +95,13 @@ pub struct PostAggregates {
  pub featured_community: bool,
  /// If the post is featured on the site / to local.
  pub featured_local: bool,
-  pub hot_rank: i32,
-  pub hot_rank_active: i32,
+  pub hot_rank: f64,
+  pub hot_rank_active: f64,
  pub community_id: CommunityId,
  pub creator_id: PersonId,
  pub controversy_rank: f64,
+  /// A rank that amplifies smaller communities
+  pub scaled_rank: f64,
 }

 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
--- a/crates/db_schema/src/lib.rs
+++ b/crates/db_schema/src/lib.rs
@ -54,6 +54,7 @@ use ts_rs::TS;
 )]
 #[cfg_attr(feature = "full", DbValueStyle = "verbatim")]
 #[cfg_attr(feature = "full", ts(export))]
+// TODO add the controversial and scaled rankings to the doc below
 /// The post sort types. See here for descriptions: https://join-lemmy.org/docs/en/users/03-votes-and-ranking.html
 pub enum SortType {
  #[default]
@ -75,6 +76,7 @@ pub enum SortType {
  TopSixMonths,
  TopNineMonths,
  Controversial,
+  Scaled,
 }

 #[derive(EnumString, Display, Debug, Serialize, Deserialize, Clone, Copy)]
--- a/crates/db_schema/src/schema.rs
+++ b/crates/db_schema/src/schema.rs
@ -100,7 +100,7 @@ diesel::table! {
        downvotes -> Int8,
        published -> Timestamptz,
        child_count -> Int4,
-        hot_rank -> Int4,
+        hot_rank -> Float8,
        controversy_rank -> Float8,
    }
 }
@ -198,7 +198,7 @@ diesel::table! {
        users_active_week -> Int8,
        users_active_month -> Int8,
        users_active_half_year -> Int8,
-        hot_rank -> Int4,
+        hot_rank -> Float8,
    }
 }

@ -299,6 +299,16 @@ diesel::table! {
    }
 }

+diesel::table! {
+    image_upload (id) {
+        id -> Int4,
+        local_user_id -> Int4,
+        pictrs_alias -> Text,
+        pictrs_delete_token -> Text,
+        published -> Timestamptz,
+    }
+}
+
 diesel::table! {
    instance (id) {
        id -> Int4,
@ -683,11 +693,12 @@ diesel::table! {
        newest_comment_time -> Timestamptz,
        featured_community -> Bool,
        featured_local -> Bool,
-        hot_rank -> Int4,
-        hot_rank_active -> Int4,
+        hot_rank -> Float8,
+        hot_rank_active -> Float8,
        community_id -> Int4,
        creator_id -> Int4,
        controversy_rank -> Float8,
+        scaled_rank -> Float8,
    }
 }

@ -893,6 +904,7 @@ diesel::joinable!(custom_emoji_keyword -> custom_emoji (custom_emoji_id));
 diesel::joinable!(email_verification -> local_user (local_user_id));
 diesel::joinable!(federation_allowlist -> instance (instance_id));
 diesel::joinable!(federation_blocklist -> instance (instance_id));
+diesel::joinable!(image_upload -> local_user (local_user_id));
 diesel::joinable!(local_site -> site (site_id));
 diesel::joinable!(local_site_rate_limit -> local_site (local_site_id));
 diesel::joinable!(local_user -> person (person_id));
@ -967,6 +979,7 @@ diesel::allow_tables_to_appear_in_same_query!(
    email_verification,
    federation_allowlist,
    federation_blocklist,
+    image_upload,
    instance,
    language,
    local_site,
--- a/crates/db_schema/src/utils.rs
+++ b/crates/db_schema/src/utils.rs
@ -347,7 +347,7 @@ pub fn naive_now() -> DateTime<Utc> {

 pub fn post_to_comment_sort_type(sort: SortType) -> CommentSortType {
  match sort {
-    SortType::Active | SortType::Hot => CommentSortType::Hot,
+    SortType::Active | SortType::Hot | SortType::Scaled => CommentSortType::Hot,
    SortType::New | SortType::NewComments | SortType::MostComments => CommentSortType::New,
    SortType::Old => CommentSortType::Old,
    SortType::Controversial => CommentSortType::Controversial,
@ -384,7 +384,11 @@ pub mod functions {
  use diesel::sql_types::{BigInt, Text, Timestamptz};

  sql_function! {
-    fn hot_rank(score: BigInt, time: Timestamptz) -> Integer;
+    fn hot_rank(score: BigInt, time: Timestamptz) -> Double;
+  }
+
+  sql_function! {
+    fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
  }

  sql_function! {
--- a/crates/db_views/src/comment_report_view.rs
+++ b/crates/db_views/src/comment_report_view.rs
@ -432,7 +432,7 @@ mod tests {
        downvotes: 0,
        published: agg.published,
        child_count: 0,
-        hot_rank: 1728,
+        hot_rank: 0.1728,
        controversy_rank: 0.0,
      },
      my_vote: None,
--- a/crates/db_views/src/comment_view.rs
+++ b/crates/db_views/src/comment_view.rs
@ -886,7 +886,7 @@ mod tests {
        downvotes: 0,
        published: agg.published,
        child_count: 5,
-        hot_rank: 1728,
+        hot_rank: 0.1728,
        controversy_rank: 0.0,
      },
    }
--- a/crates/db_views/src/post_view.rs
+++ b/crates/db_views/src/post_view.rs
@ -380,6 +380,9 @@ fn queries<'a>() -> Queries<
      SortType::Hot => query
        .then_order_by(post_aggregates::hot_rank.desc())
        .then_order_by(post_aggregates::published.desc()),
+      SortType::Scaled => query
+        .then_order_by(post_aggregates::scaled_rank.desc())
+        .then_order_by(post_aggregates::published.desc()),
      SortType::Controversial => query.then_order_by(post_aggregates::controversy_rank.desc()),
      SortType::New => query.then_order_by(post_aggregates::published.desc()),
      SortType::Old => query.then_order_by(post_aggregates::published.asc()),
@ -1154,9 +1157,10 @@ mod tests {
        newest_comment_time: inserted_post.published,
        featured_community: false,
        featured_local: false,
-        hot_rank: 1728,
-        hot_rank_active: 1728,
+        hot_rank: 0.1728,
+        hot_rank_active: 0.1728,
        controversy_rank: 0.0,
+        scaled_rank: 0.3621,
        community_id: inserted_post.community_id,
        creator_id: inserted_post.creator_id,
      },
--- a/crates/db_views_actor/src/community_view.rs
+++ b/crates/db_views_actor/src/community_view.rs
@ -105,7 +105,7 @@ fn queries<'a>() -> Queries<
    }

    match options.sort.unwrap_or(Hot) {
-      Hot | Active => query = query.order_by(community_aggregates::hot_rank.desc()),
+      Hot | Active | Scaled => query = query.order_by(community_aggregates::hot_rank.desc()),
      NewComments | TopDay | TopTwelveHour | TopSixHour | TopHour => {
        query = query.order_by(community_aggregates::users_active_day.desc())
      }
--- a/migrations/2023-08-23-182533_scaled_rank/down.sql
+++ b/migrations/2023-08-23-182533_scaled_rank/down.sql
@ -0,0 +1,87 @@
+DROP FUNCTION scaled_rank;
+
+ALTER TABLE community_aggregates
+    ALTER COLUMN hot_rank TYPE integer,
+    ALTER COLUMN hot_rank SET DEFAULT 1728;
+
+ALTER TABLE comment_aggregates
+    ALTER COLUMN hot_rank TYPE integer,
+    ALTER COLUMN hot_rank SET DEFAULT 1728;
+
+ALTER TABLE post_aggregates
+    ALTER COLUMN hot_rank TYPE integer,
+    ALTER COLUMN hot_rank SET DEFAULT 1728,
+    ALTER COLUMN hot_rank_active TYPE integer,
+    ALTER COLUMN hot_rank_active SET DEFAULT 1728;
+
+-- Change back to integer version
+DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
+
+CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
+    RETURNS integer
+    AS $$
+DECLARE
+    hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
+BEGIN
+    IF (hours_diff > 0) THEN
+        RETURN floor(10000 * log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8))::integer;
+    ELSE
+        -- if the post is from the future, set hot score to 0. otherwise you can game the post to
+        -- always be on top even with only 1 vote by setting it to the future
+        RETURN 0;
+    END IF;
+END;
+$$
+LANGUAGE plpgsql
+IMMUTABLE PARALLEL SAFE;
+
+ALTER TABLE post_aggregates
+    DROP COLUMN scaled_rank;
+
+-- The following code is necessary because postgres can't remove
+-- a single enum value.
+ALTER TABLE local_user
+    ALTER default_sort_type DROP DEFAULT;
+
+UPDATE
+    local_user
+SET
+    default_sort_type = 'Hot'
+WHERE
+    default_sort_type = 'Scaled';
+
+-- rename the old enum
+ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;
+
+-- create the new enum
+CREATE TYPE sort_type_enum AS ENUM (
+    'Active',
+    'Hot',
+    'New',
+    'Old',
+    'TopDay',
+    'TopWeek',
+    'TopMonth',
+    'TopYear',
+    'TopAll',
+    'MostComments',
+    'NewComments',
+    'TopHour',
+    'TopSixHour',
+    'TopTwelveHour',
+    'TopThreeMonths',
+    'TopSixMonths',
+    'TopNineMonths'
+);
+
+-- alter all your enum columns
+ALTER TABLE local_user
+    ALTER COLUMN default_sort_type TYPE sort_type_enum
+    USING default_sort_type::text::sort_type_enum;
+
+ALTER TABLE local_user
+    ALTER default_sort_type SET DEFAULT 'Active';
+
+-- drop the old enum
+DROP TYPE sort_type_enum__;
+
--- a/migrations/2023-08-23-182533_scaled_rank/up.sql
+++ b/migrations/2023-08-23-182533_scaled_rank/up.sql
@ -0,0 +1,74 @@
+-- Change hot ranks and functions from an int to a float
+ALTER TABLE community_aggregates
+    ALTER COLUMN hot_rank TYPE float,
+    ALTER COLUMN hot_rank SET DEFAULT 0.1728;
+
+ALTER TABLE comment_aggregates
+    ALTER COLUMN hot_rank TYPE float,
+    ALTER COLUMN hot_rank SET DEFAULT 0.1728;
+
+ALTER TABLE post_aggregates
+    ALTER COLUMN hot_rank TYPE float,
+    ALTER COLUMN hot_rank SET DEFAULT 0.1728,
+    ALTER COLUMN hot_rank_active TYPE float,
+    ALTER COLUMN hot_rank_active SET DEFAULT 0.1728;
+
+DROP FUNCTION hot_rank (numeric, published timestamp with time zone);
+
+CREATE OR REPLACE FUNCTION hot_rank (score numeric, published timestamp with time zone)
+    RETURNS float
+    AS $$
+DECLARE
+    hours_diff numeric := EXTRACT(EPOCH FROM (now() - published)) / 3600;
+BEGIN
+    -- 24 * 7 = 168, so after a week, it will default to 0.
+    IF (hours_diff > 0 AND hours_diff < 168) THEN
+        RETURN log(greatest (1, score + 3)) / power((hours_diff + 2), 1.8);
+    ELSE
+        -- if the post is from the future, set hot score to 0. otherwise you can game the post to
+        -- always be on top even with only 1 vote by setting it to the future
+        RETURN 0.0;
+    END IF;
+END;
+$$
+LANGUAGE plpgsql
+IMMUTABLE PARALLEL SAFE;
+
+-- The new scaled rank function
+CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
+    RETURNS float
+    AS $$
+BEGIN
+    -- Add 2 to avoid divide by zero errors
+    -- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621
+    -- There may need to be a scale factor multiplied to users_active_month, to make
+    -- the log curve less pronounced. This can be tuned in the future.
+    RETURN (hot_rank (score, published) / log(2 + users_active_month));
+END;
+$$
+LANGUAGE plpgsql
+IMMUTABLE PARALLEL SAFE;
+
+ALTER TABLE post_aggregates
+    ADD COLUMN scaled_rank float NOT NULL DEFAULT 0.3621;
+
+UPDATE
+    post_aggregates
+SET
+    scaled_rank = 0
+WHERE
+    hot_rank = 0
+    OR hot_rank_active = 0;
+
+CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);
+
+CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);
+
+-- We forgot to add the controversial sort type
+ALTER TYPE sort_type_enum
+    ADD VALUE 'Controversial';
+
+-- Add the Scaled enum
+ALTER TYPE sort_type_enum
+    ADD VALUE 'Scaled';
+
--- a/src/scheduled_tasks.rs
+++ b/src/scheduled_tasks.rs
@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
 fn update_hot_ranks(conn: &mut PgConnection) {
  info!("Updating hot ranks for all history...");

-  process_hot_ranks_in_batches(
-    conn,
-    "post_aggregates",
-    "a.hot_rank != 0 OR a.hot_rank_active != 0",
-    "SET hot_rank = hot_rank(a.score, a.published),
-         hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
-  );
+  process_post_aggregates_ranks_in_batches(conn);

-  process_hot_ranks_in_batches(
+  process_ranks_in_batches(
    conn,
    "comment_aggregates",
    "a.hot_rank != 0",
    "SET hot_rank = hot_rank(a.score, a.published)",
  );

-  process_hot_ranks_in_batches(
+  process_ranks_in_batches(
    conn,
    "community_aggregates",
    "a.hot_rank != 0",
@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
 /// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
 /// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
 /// run)
-fn process_hot_ranks_in_batches(
+fn process_ranks_in_batches(
  conn: &mut PgConnection,
  table_name: &str,
  where_clause: &str,
@ -241,6 +235,55 @@ fn process_hot_ranks_in_batches(
  );
 }

+/// Post aggregates is a special case, since it needs to join to the community_aggregates
+/// table, to get the active monthly user counts.
+fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
+  let process_start_time: DateTime<Utc> = Utc
+    .timestamp_opt(0, 0)
+    .single()
+    .expect("0 timestamp creation");
+
+  let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
+  let mut processed_rows_count = 0;
+  let mut previous_batch_result = Some(process_start_time);
+  while let Some(previous_batch_last_published) = previous_batch_result {
+    let result = sql_query(
+      r#"WITH batch AS (SELECT pa.id
+               FROM post_aggregates pa
+               WHERE pa.published > $1
+               AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0)
+               ORDER BY pa.published
+               LIMIT $2
+               FOR UPDATE SKIP LOCKED)
+         UPDATE post_aggregates pa
+           SET hot_rank = hot_rank(pa.score, pa.published),
+           hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
+           scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
+         FROM batch, community_aggregates ca
+         WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
+    "#,
+    )
+    .bind::<Timestamptz, _>(previous_batch_last_published)
+    .bind::<Integer, _>(update_batch_size)
+    .get_results::<HotRanksUpdateResult>(conn);
+
+    match result {
+      Ok(updated_rows) => {
+        processed_rows_count += updated_rows.len();
+        previous_batch_result = updated_rows.last().map(|row| row.published);
+      }
+      Err(e) => {
+        error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
+        break;
+      }
+    }
+  }
+  info!(
+    "Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
+    "post_aggregates", processed_rows_count
+  );
+}
+
 fn delete_expired_captcha_answers(conn: &mut PgConnection) {
  diesel::delete(
    captcha_answer::table.filter(captcha_answer::published.lt(now() - IntervalDsl::minutes(10))),