From ad4715c2a3efeca517b69e2ad4dbc03a19f54818 Mon Sep 17 00:00:00 2001 From: Dessalines Date: Mon, 20 Dec 2021 16:57:36 -0500 Subject: [PATCH] Trying another tokenizer fix. #1964 --- crates/utils/src/request.rs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/crates/utils/src/request.rs b/crates/utils/src/request.rs index 10341e801..081eaa2b0 100644 --- a/crates/utils/src/request.rs +++ b/crates/utils/src/request.rs @@ -5,7 +5,7 @@ use reqwest_middleware::ClientWithMiddleware; use serde::{Deserialize, Serialize}; use std::future::Future; use thiserror::Error; -use tracing::error; +use tracing::{error, info}; use url::Url; use webpage::HTML; @@ -64,12 +64,18 @@ pub async fn fetch_site_metadata( client: &ClientWithMiddleware, url: &Url, ) -> Result { + info!("Fetching site metadata for url: {}", url); let response = client.get(url.as_str()).send().await?; - let html = response - .text() + // Can't use .text() here, because it only checks the content header, not the actual bytes + // https://github.com/LemmyNet/lemmy/issues/1964 + let html_bytes = response + .bytes() .await - .map_err(|e| RecvError(e.to_string()))?; + .map_err(|e| RecvError(e.to_string()))? + .to_vec(); + + let html = String::from_utf8_lossy(&html_bytes); let tags = html_to_site_metadata(&html)?; @@ -77,6 +83,20 @@ pub async fn fetch_site_metadata( } fn html_to_site_metadata(html: &str) -> Result { + // Make sure the first line is doctype html + let first_line = html + .lines() + .into_iter() + .next() + .ok_or_else(|| LemmyError::from_message("No lines in html"))? + .to_lowercase(); + + if !first_line.starts_with("") { + return Err(LemmyError::from_message( + "Site metadata page fetch is not DOCTYPE html", + )); + } + let page = HTML::from_string(html.to_string(), None)?; let page_title = page.title;