From 62c8ac1db50f25c96fc1010b4f41d9eb450543f0 Mon Sep 17 00:00:00 2001 From: Otto Rottier Date: Mon, 26 Jun 2023 15:07:57 +0200 Subject: [PATCH] Site Metadata: resolve relative URLs for embedded images/videos (#3338) * Site Metadata: resolve relative URLs for embedded images/videos * api_common: relax version requirement of `webpage` dependency With this change we opt into next (non breaking) versions of webpage-rs * cargo +nightly fmt * Add tests for resolving absolute urls in SiteMetadata --- crates/api_common/Cargo.toml | 2 +- crates/api_common/src/request.rs | 59 +++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/crates/api_common/Cargo.toml b/crates/api_common/Cargo.toml index 339d233a1..a9b2bf19b 100644 --- a/crates/api_common/Cargo.toml +++ b/crates/api_common/Cargo.toml @@ -33,7 +33,7 @@ reqwest-middleware = { workspace = true, optional = true } regex = { workspace = true } rosetta-i18n = { workspace = true, optional = true } percent-encoding = { workspace = true, optional = true } -webpage = { version = "1.6.0", default-features = false, features = ["serde"], optional = true } +webpage = { version = "1.6", default-features = false, features = ["serde"], optional = true } encoding = { version = "0.2.33", optional = true } anyhow = { workspace = true } futures = { workspace = true } diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index 3139193a6..9f7f9db59 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -27,12 +27,12 @@ pub async fn fetch_site_metadata( // https://github.com/LemmyNet/lemmy/issues/1964 let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); - let tags = html_to_site_metadata(&html_bytes)?; + let tags = html_to_site_metadata(&html_bytes, url)?; Ok(tags) } -fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> { +fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> { let html =
String::from_utf8_lossy(html_bytes); // Make sure the first line is doctype html @@ -81,12 +81,14 @@ fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> .opengraph .images .first() - .and_then(|ogo| Url::parse(&ogo.url).ok()); + // join also works if the target URL is absolute + .and_then(|ogo| url.join(&ogo.url).ok()); let og_embed_url = page .opengraph .videos .first() - .and_then(|v| Url::parse(&v.url).ok()); + // join also works if the target URL is absolute + .and_then(|v| url.join(&v.url).ok()); Ok(SiteMetadata { title: og_title.or(page_title), @@ -266,7 +268,12 @@ pub fn build_user_agent(settings: &Settings) -> String { #[cfg(test)] mod tests { - use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata}; + use crate::request::{ + build_user_agent, + fetch_site_metadata, + html_to_site_metadata, + SiteMetadata, + }; use lemmy_utils::settings::SETTINGS; use url::Url; @@ -305,4 +312,46 @@ mod tests { // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu"); // assert!(res_other.is_err()); // } + + #[test] + fn test_resolve_image_url() { + // url that lists the opengraph fields + let url = Url::parse("https://example.com/one/two.html").unwrap(); + + // root relative url + let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>"; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://example.com/image.jpg").unwrap().into()) + ); + + // base relative url + let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>"; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some( + Url::parse("https://example.com/one/image.jpg") + .unwrap() + .into() + ) + ); + + // absolute url + let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>"; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into()) + ); + +
// protocol relative url + let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>"; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://example.com/image.jpg").unwrap().into()) + ); + } }