Reimagine model & add search method.

Internally no longer rely on parsing HTML but instead parse JSON (using
regex to grep the JSON from the HTML).

Playlist support is commented out / hidden due to the complexity of
implementing it. Methods exposing this functionality can be added later.
pull/4/head
Vilgot Fredenberg 3 years ago
parent 875fda42ff
commit 2d32d70d91
No known key found for this signature in database
GPG Key ID: 7B1BB6C490FC6780

@ -1,7 +1,10 @@
[package] [package]
name = "youtube-metadata" name = "youtube-metadata"
version = "0.2.0" version = "0.2.0"
authors = ["trivernis <trivernis@protonmail.com>"] authors = [
"trivernis <trivernis@protonmail.com>",
"Vilgot Fredenberg <vilgot@fredenberg.xyz>",
]
edition = "2018" edition = "2018"
description = "YouTube video metadata fetcher" description = "YouTube video metadata fetcher"
readme = "README.md" readme = "README.md"
@ -11,9 +14,17 @@ license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
reqwest = "0.11.3"
scraper = "0.12.0"
lazy_static = "1.4.0" lazy_static = "1.4.0"
reqwest = { default-features = false, version = "0.11.3" }
regex = "1"
serde = { features = ["derive"], optional = true, version = "1" }
serde_json = "1"
tracing = "0.1"
[dev-dependencies] [dev-dependencies]
tokio = { version = "1.5.0", features = ["macros", "rt-multi-thread"] } tokio = { features = ["macros", "rt-multi-thread"], version = "1.5.0" }
[features]
default = ["native"]
native = ["reqwest/default-tls"]
rustls = ["reqwest/rustls-tls"]

@ -1,34 +1,211 @@
use crate::error::Result; use crate::{
use crate::parsing::video_information::parse_video_information; error::Error,
use crate::types::VideoInformation; model::{id::VideoId, search::SearchResult, Resource, Video},
parsing::{search::search_information, video_information},
/// Returns information about a video };
/// ```
/// use youtube_metadata::get_video_information; /// Reusable client, [`NotReusable`]s cousin.
/// # #[tokio::test] ///
/// # async fn doctest() { /// Internally wraps around an [`Arc`], so cloning is cheap.
/// let information = get_video_information("https://www.youtube.com/watch?v=dQw4w9WgXcQ") ///
/// .await /// [`Arc`]: std::sync::Arc
/// .unwrap(); #[derive(Clone, Debug, Default)]
/// assert_eq!(information.id, "dQw4w9WgXcQ".to_string()); pub struct Reusable(reqwest::Client);
/// assert_eq!(
/// information.url, impl Reusable {
/// "https://www.youtube.com/watch?v=dQw4w9WgXcQ".to_string() /// Create a new reusable client.
/// ); pub fn new() -> Self {
/// assert_eq!(information.uploader, "RickAstleyVEVO".to_string()); Self(reqwest::Client::new())
/// assert_eq!( }
/// information.title,
/// "Rick Astley - Never Gonna Give You Up (Video)".to_string() // Not implemented
/// ); /*
/// assert_eq!( * /// Get a playlist by its id.
/// information.thumbnail, * pub async fn playlist(&self, playlist: PlaylistId) -> Result<Playlist, Error> {
/// Some("https://i.ytimg.com/vi/dQw4w9WgXcQ/maxresdefault.jpg".to_string()) * todo!()
/// ); * }
/// # } */
/// ```
pub async fn get_video_information(url: &str) -> Result<VideoInformation> { /// Search for some query on youtube
let response = reqwest::get(url).await?; ///
let response_text = response.text().await?; /// # Example
///
parse_video_information(&response_text) /// ```no_run
/// # use youtube_metadata::Reusable;
/// use std::time::Duration;
///
/// # async fn doc() -> Result<(), Box<dyn std::error::Error>> {
/// let reusable = Reusable::new();
/// let first = reusable.search("Rick Astley - Never Gonna Give You Up (Official Music Video)")
/// .await?
/// .videos()
/// .next()
/// .expect("atleast one result");
/// assert_eq!(first.id.as_str(), "dQw4w9WgXcQ");
/// assert_eq!(first.length, Duration::from_secs(213));
/// assert_eq!(first.title,
/// String::from("Rick Astley - Never Gonna Give You Up (Official Music Video)"));
/// assert_eq!(first.uploader.name, "Rick Astley");
/// # Ok(())
/// # }
/// ```
pub async fn search(&self, search: &str) -> Result<SearchResult, Error> {
    // `query` percent-encodes the search term, so it is never formatted into the url by hand.
    let html = self
        .0
        .get("https://youtube.com/results?")
        .query(&[("q", search)])
        .send()
        .await?
        .text()
        .await?;

    // The result list is extracted from the returned page by the parsing module.
    search_information(&html)
}
/// Get a video by its id.
pub async fn video(&self, video: VideoId) -> Result<Video, Error> {
let url = format!("https://www.youtube.com/watch?v={}", video);
match self.query(&url).await? {
Resource::Video(v) => (Ok(v)),
_ => unreachable!(),
}
}
/// Fetch a resource from a url.
///
/// Playlists are not implemented yet, so the response is always parsed as a video page and
/// the returned [`Resource`] is always a [`Resource::Video`].
pub async fn query(&self, query: &str) -> Result<Resource, Error> {
    let page = self.0.get(query).send().await?.text().await?;
    // Only videos are supported for now, so every page is treated as a video page.
    let video = video_information(&page)?;
    Ok(Resource::Video(video))
}
}
/// Zero sized associated function holder, [`Reusable`]s cousin.
///
/// Every associated function creates a new `reqwest::Client` on each invocation instead of
/// keeping one around between calls.
#[derive(Debug)]
pub struct NotReusable;
impl NotReusable {
// Not implemented
/*
* /// Get a playlist by its id.
* pub async fn playlist(playlist: PlaylistId) -> Result<Playlist, Error> {
* todo!()
* }
*/
/// Search for some query on youtube
///
/// # Example
///
/// ```no_run
/// # use youtube_metadata::NotReusable;
/// #
/// use std::time::Duration;
///
/// # async fn doc() -> Result<(), Box<dyn std::error::Error>> {
/// let first = NotReusable::search("Rick Astley - Never Gonna Give You Up (Official Music Video)")
/// .await?
/// .videos()
/// .next()
/// .expect("atleast one result");
/// assert_eq!(first.id.as_str(), "dQw4w9WgXcQ");
/// assert_eq!(first.length, Duration::from_secs(213));
/// assert_eq!(first.title,
/// String::from("Rick Astley - Never Gonna Give You Up (Official Music Video)"));
/// assert_eq!(first.uploader.name, "Rick Astley");
/// # Ok(())
/// # }
/// ```
pub async fn search(search: &str) -> Result<SearchResult, Error> {
let client = reqwest::Client::new();
let request = client
.get("https://youtube.com/results?")
.query(&[("q", search)])
.build()?;
let response_text = client.execute(request).await?.text().await?;
search_information(&response_text)
}
/// Get a video by its id.
pub async fn video(video: VideoId) -> Result<Video, Error> {
let url = format!("https://www.youtube.com/watch?v={}", video);
match Self::query(&url).await? {
Resource::Video(v) => (Ok(v)),
_ => unreachable!(),
}
}
/// Fetch a resource from a url.
///
/// Will only resolve to [`Resource::Video`] right now due to playlists being unsupported.
///
/// [`Resource`] will currently only contain a video due to playlists being unimplemented.
pub async fn query(query: &str) -> Result<Resource, Error> {
let client = reqwest::Client::new();
let request = client.get(query).build()?;
let response_text = client.execute(request).await?.text().await?;
// for now call this since only videos are supported.
Ok(Resource::Video(video_information(&response_text)?))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::Duration;
// Network test: looks up the same well-known video through a search and through a direct
// fetch, then checks that both paths report the expected metadata.
#[tokio::test]
async fn rickroll() -> Result<(), Box<dyn std::error::Error>> {
// First video of the search results; `videos()` filters playlists out.
let search =
NotReusable::search("Rick Astley - Never Gonna Give You Up (Official Music Video)")
.await?
.videos()
.next()
.expect("atleast one result");
let video = NotReusable::video(VideoId::new("dQw4w9WgXcQ")).await?;
assert_eq!(search.id.as_str(), "dQw4w9WgXcQ");
assert_eq!(video.id.as_str(), "dQw4w9WgXcQ");
// The two code paths obtain the length from different fields, hence whole seconds for the
// search result but milliseconds for the directly fetched video.
assert_eq!(search.length, Duration::from_secs(213));
assert_eq!(video.length, Duration::from_millis(212091));
assert_eq!(
search.title.as_str(),
"Rick Astley - Never Gonna Give You Up (Official Music Video)"
);
assert_eq!(
video.title.as_str(),
"Rick Astley - Never Gonna Give You Up (Official Music Video)"
);
assert_eq!(search.uploader.name, "Rick Astley");
assert_eq!(video.uploader.name, "Rick Astley");
Ok(())
}
// Network test: a search for "live music" must complete without an error.
// NOTE(review): presumably meant to exercise results containing live streams - confirm.
#[tokio::test]
async fn live() -> Result<(), Box<dyn std::error::Error>> {
NotReusable::search("live music").await?;
Ok(())
}
// Network test: a search for "music playlist" must complete without an error.
// NOTE(review): presumably meant to exercise results containing playlists - confirm.
#[tokio::test]
async fn playlist() -> Result<(), Box<dyn std::error::Error>> {
NotReusable::search("music playlist").await?;
Ok(())
}
} }

@ -1,3 +1,6 @@
//! Error types of this library.
//!
//! Note that parsing should never fail; a parse failure is indicative of an internal error.
use std::{ use std::{
error::Error as StdError, error::Error as StdError,
fmt::{Display, Formatter, Result as FmtResult}, fmt::{Display, Formatter, Result as FmtResult},
@ -5,12 +8,39 @@ use std::{
use reqwest::Error as ReqwestError; use reqwest::Error as ReqwestError;
pub type Result<T> = std::result::Result<T, Error>; #[derive(Debug)]
/// Error produced when parsing fails.
///
/// Hidden from the documentation; the failure kind is only visible inside this crate.
#[doc(hidden)]
pub struct ParseError {
// Which kind of parse failure occurred; selects the message printed by `Display`.
pub(crate) kind: ParseErrorKind,
}
impl Display for ParseError {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
use ParseErrorKind::*;
match self.kind {
Other => f.write_str("something failed to parse"),
Regex => f.write_str("fetching json using regex failed"),
}
}
}
// `ParseError` stores only a kind and no underlying cause, so the provided trait methods are
// left at their defaults.
impl StdError for ParseError {}
// Crate-internal classification of parse failures.
#[derive(Debug)]
pub(crate) enum ParseErrorKind {
// Any other parse failure; displayed as "something failed to parse".
Other,
// Failure while obtaining the JSON via regex; displayed as
// "fetching json using regex failed".
Regex,
}
/// Error types.
#[derive(Debug)] #[derive(Debug)]
pub enum Error { pub enum Error {
/// Error doing http.
Reqwest(ReqwestError), Reqwest(ReqwestError),
Parse(Parsing), /// Internal parsing error.
/// Hitting this should never happen and is a bug.
Parse(ParseError),
} }
impl Display for Error { impl Display for Error {
@ -18,7 +48,7 @@ impl Display for Error {
use Error::*; use Error::*;
match self { match self {
Reqwest(e) => e.fmt(f), Reqwest(e) => e.fmt(f),
Parse(_) => write!(f, "parse error"), Parse(_) => write!(f, "json parsing error"),
} }
} }
} }
@ -33,32 +63,14 @@ impl StdError for Error {
} }
} }
impl From<Parsing> for Error {
fn from(s: Parsing) -> Self {
Self::Parse(s)
}
}
impl From<ReqwestError> for Error { impl From<ReqwestError> for Error {
fn from(e: ReqwestError) -> Self { fn from(e: ReqwestError) -> Self {
Self::Reqwest(e) Self::Reqwest(e)
} }
} }
#[derive(Debug)] impl From<ParseError> for Error {
pub enum Parsing { fn from(e: ParseError) -> Self {
MissingElement(String), Self::Parse(e)
MissingAttribute(String),
}
impl Display for Parsing {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
use Parsing::*;
match self {
MissingAttribute(s) => write!(f, "missing attribute: {}", s),
MissingElement(s) => write!(f, "missing element: {}", s),
}
} }
} }
impl StdError for Parsing {}

@ -1,9 +1,15 @@
pub(crate) mod endpoints; //! Library that searches youtube and parses the result to [`model`].
//!
//! [`Reusable`] reuses the same http client on each `GET` request.
//! This takes advantage of keep-alive connections.
#![deny(clippy::inconsistent_struct_constructor)]
#![deny(missing_docs)]
#![deny(missing_debug_implementations)]
#![deny(rustdoc::broken_intra_doc_links)]
mod endpoints;
pub mod error; pub mod error;
pub mod model;
pub(crate) mod parsing; pub(crate) mod parsing;
pub(crate) mod types;
pub use endpoints::get_video_information;
#[cfg(test)] pub use endpoints::{NotReusable, Reusable};
mod tests;

@ -0,0 +1,98 @@
//! Mapping of output.
//!
//! Use the resource's id's to get thumbnails or urls.
use std::time::Duration;
use id::{PlaylistId, VideoId};
use search::{PartialPlaylist, PartialPlaylistVideo};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use self::id::ChannelId;
pub mod id;
pub mod search;
pub mod thumbnail;
/// Information about a channel.
///
/// Note that this is *not* a user, so its [`Channel::id`] is of the form `/channel/ID`, not
/// `/user/ID`.
/// The link still resolves to the same page, so this should not be an issue in most cases.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Channel {
/// The channel's unique Id.
pub id: ChannelId,
/// The channel's name.
pub name: String,
}
/// Information about a playlist.
// Hide since not implemented.
#[doc(hidden)]
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Playlist {
/// The playlist's unique Id.
pub id: PlaylistId,
/// The playlist's tracks.
pub tracks: Vec<Video>,
/// The playlist's title.
pub title: String,
/// The playlist's uploader.
pub uploader: Channel,
}
impl From<Playlist> for PartialPlaylist {
    /// Shrinks a full [`Playlist`] into the partial form returned by searches.
    ///
    /// Only the first two tracks are kept, while `tracks_total` still reports how many tracks
    /// the full playlist had.
    fn from(p: Playlist) -> Self {
        // Saturate instead of silently wrapping if the count ever exceeds `u32::MAX`.
        let tracks_total = p.tracks.len().min(u32::MAX as usize) as u32;
        // partial playlist only contains up to the first two videos
        let tracks = p.tracks.into_iter().take(2).map(Into::into).collect();
        Self {
            id: p.id,
            tracks,
            tracks_total,
            title: p.title,
            uploader: p.uploader,
        }
    }
}
/// Resource types.
///
/// This is what fetching a url through `query` resolves to.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Resource {
/// Playlist type.
///
/// Note that this variant is never constructed (not implemented).
Playlist(Playlist),
/// Video type.
Video(Video),
}
/// Information about a video.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Video {
/// The video's unique Id.
pub id: VideoId,
/// The video's length.
///
/// Videos from a search carry a length with second precision, while directly fetched videos
/// carry an approximate length with millisecond precision.
pub length: Duration,
/// The video's title.
pub title: String,
/// The video's uploader.
pub uploader: Channel,
}
impl From<Video> for PartialPlaylistVideo {
fn from(v: Video) -> Self {
Self {
id: v.id,
length: v.length,
title: v.title,
}
}
}

@ -0,0 +1,121 @@
//! Type-safe resource identifiers.
//!
//! Note that a `thumbnail` method is unavailable for [`PlaylistId`], this is due to playlist's
//! using their first video's thumbnail.
use std::fmt;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::thumbnail;
/// Channel identifier.
///
/// Use [`ChannelId::url`] to get the channel url.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct ChannelId(String);

impl fmt::Display for ChannelId {
    /// Writes the bare identifier, without any url parts.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}

impl ChannelId {
    /// Wraps a copy of `id`; no validation is performed.
    pub(crate) fn new(id: &str) -> Self {
        Self(String::from(id))
    }

    /// Yields the underlying string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Consume the id, returning the underlying string.
    pub fn into_string(self) -> String {
        let Self(inner) = self;
        inner
    }

    /// Get the channel url
    pub fn url(&self) -> String {
        format!("https://www.youtube.com/channel/{}", self.0)
    }
}
/// Playlist identifier.
///
/// Use [`PlaylistId::url`] to get the playlist url.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct PlaylistId(String);

impl fmt::Display for PlaylistId {
    /// Writes the bare identifier, without any url parts.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}

impl PlaylistId {
    /// Wraps a copy of `id`; no validation is performed.
    pub(crate) fn new(id: &str) -> Self {
        Self(String::from(id))
    }

    /// Yields the underlying string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Consume the id, returning the underlying string.
    pub fn into_string(self) -> String {
        let Self(inner) = self;
        inner
    }

    /// Get the playlist url
    pub fn url(&self) -> String {
        format!("https://www.youtube.com/playlist?list={}", self.0)
    }
}
/// Video identifier.
///
/// Use [`VideoId::url`] to get the video url and [`VideoId::thumbnail`] for a thumbnail url.
// TODO: feature flag for staticvec (allows `Copy`) (requires nightly)
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct VideoId(String);

impl fmt::Display for VideoId {
    /// Writes the bare identifier, without any url parts.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}

impl VideoId {
    /// Wraps a copy of `id`; no validation is performed.
    pub(crate) fn new(id: &str) -> Self {
        Self(String::from(id))
    }

    /// Yields the underlying string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Consume the id, returning the underlying string.
    pub fn into_string(self) -> String {
        let Self(inner) = self;
        inner
    }

    /// Get the thumbnail url.
    ///
    /// The image format selects both the directory and the file extension of the url, the
    /// resolution selects the file name.
    pub fn thumbnail(&self, format: thumbnail::ImageFormat, res: thumbnail::Resolution) -> String {
        let (directory, extension) = match format {
            thumbnail::ImageFormat::JPEG => ("vi", "jpg"),
            thumbnail::ImageFormat::WebP => ("vi_webp", "webp"),
        };
        format!(
            "{}/{}/{}/{}.{}",
            thumbnail::URL,
            directory,
            self,
            res,
            extension
        )
    }

    /// Get the video url.
    pub fn url(&self) -> String {
        format!("https://youtu.be/{}", self.0)
    }
}

@ -0,0 +1,104 @@
//! Search related models.
use std::time::Duration;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use super::{
id::{PlaylistId, VideoId},
Channel, Video,
};
/// Search result contents.
///
/// Use [`SearchResult::videos`] or [`SearchResult::playlists`] to keep only one kind of item.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SearchResult {
/// List of search result items.
pub items: Vec<SearchItem>,
}
impl SearchResult {
    /// Consumes the result, yielding only its [`Video`]s through an [`Iterator`].
    pub fn videos(self) -> impl Iterator<Item = Video> {
        self.items.into_iter().filter_map(|item| {
            if let SearchItem::Video(video) = item {
                Some(video)
            } else {
                None
            }
        })
    }

    /// Consumes the result, yielding only its [`PartialPlaylist`]s through an [`Iterator`].
    pub fn playlists(self) -> impl Iterator<Item = PartialPlaylist> {
        self.items.into_iter().filter_map(|item| {
            if let SearchItem::Playlist(playlist) = item {
                Some(playlist)
            } else {
                None
            }
        })
    }
}
/// Contains the possible items of a search.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum SearchItem {
/// Playlist item.
///
/// Note that playlists don't contain a [`Vec<Video>`] but rather a [`Vec<PartialPlaylistVideo>`]
/// (which is missing a [`Channel`] field).
///
/// This [`Vec`] also only contains the first two items.
/// Thus to get the full list of videos in a playlist another query has to be used using the
/// [`PlaylistId`] (note that this is not implemented yet for this library).
///
/// [`PlaylistId`]: crate::model::id::PlaylistId
Playlist(PartialPlaylist),
/// Video item.
Video(Video),
}
impl SearchItem {
/// Returns an immutable reference to the name of the inner item.
pub fn title(&self) -> &str {
match self {
SearchItem::Playlist(p) => &p.title,
SearchItem::Video(v) => &v.title,
}
}
/// Returns an immutable reference to the uploader of the innner item.
pub fn uploader(&self) -> &Channel {
match self {
SearchItem::Playlist(p) => &p.uploader,
SearchItem::Video(v) => &v.uploader,
}
}
}
/// Information about a partial playlist.
///
/// This struct is returned from searches.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct PartialPlaylist {
/// The playlist's unique Id.
pub id: PlaylistId,
/// Up to the first two tracks of the playlist.
pub tracks: Vec<PartialPlaylistVideo>,
/// The total number of tracks in the playlist, which may exceed `tracks.len()`.
pub tracks_total: u32,
/// The playlist's title.
pub title: String,
/// The playlist's uploader.
pub uploader: Channel,
}
/// Information about a video in a [`PartialPlaylist`].
///
/// Carries the same fields as [`Video`] except for the uploader.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct PartialPlaylistVideo {
/// The video's unique Id.
pub id: VideoId,
/// The video's length.
pub length: Duration,
/// The video's title.
pub title: String,
}

@ -0,0 +1,65 @@
//! Thumbnail configuration types.
//!
//! Invalid thumbnails resolve to this image:
//!
//! ![](https://i.ytimg.com)
//!
//! See [YouTube's API docs][docs] for more info.
//!
//! [docs]: https://developers.google.com/youtube/v3/docs/thumbnails
//! [`VideoId`]: super::id::VideoId
use std::fmt;
/// Base url of thumbnails.
///
/// Contains no trailing slash; thumbnail urls are built by appending a path to it.
pub(crate) const URL: &str = "https://i.ytimg.com";
/// YouTube's supported image formats.
///
/// [`ImageFormat::WebP`] retains the same *or better* quality at a smaller size.
#[derive(Clone, Copy, Debug)]
// Non-exhaustive so further formats can be added without breaking downstream matches.
#[non_exhaustive]
pub enum ImageFormat {
/// Older, better supported format.
JPEG,
/// Newer (2010) and more efficient format.
WebP,
}
/// YouTube's resolution types.
///
/// Query the YouTube API to know if [`Resolution::Maxres`] or [`Resolution::Standard`] are
/// available for a resource.
///
/// The resolution depends on which resource the thumbnail is for; the sizes documented on the
/// variants are currently those of video thumbnails.
#[derive(Clone, Copy, Debug)]
pub enum Resolution {
    /// 120 x 90px
    Default,
    /// 480 x 360px
    High,
    /// 1280 x 720px
    ///
    /// Not available for all resources.
    Maxres,
    /// 320 x 180px
    Medium,
    /// 640 x 480px
    ///
    /// Not available for all resources.
    Standard,
}

impl fmt::Display for Resolution {
    /// Writes the thumbnail file name (without extension) that belongs to the resolution.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Every name is an optional quality prefix followed by `default`.
        let file_name = match self {
            Self::Default => "default",
            Self::High => "hqdefault",
            Self::Maxres => "maxresdefault",
            Self::Medium => "mqdefault",
            Self::Standard => "sddefault",
        };
        f.write_str(file_name)
    }
}

@ -1,25 +1,155 @@
use crate::error::{Parsing, Result}; use std::time::Duration;
use scraper::{ElementRef, Html, Selector};
pub mod video_information; use lazy_static::lazy_static;
use regex::Regex;
use serde_json::Value;
/// Tries selecting one element or fails if the element can't be found use crate::{
fn try_select_one<'a>(document: &'a Html, selector: &Selector) -> Result<ElementRef<'a>> { error::{Error, ParseError, ParseErrorKind},
document model::{
.select(selector) id::{ChannelId, VideoId},
.next() Channel, Video,
.ok_or_else(|| Parsing::MissingElement(format!("{:?}", selector)).into()) },
};
// Expands to a `ParseError` of kind `ParseErrorKind::Other`.
//
// `macro_rules!` macros are textually scoped, so this definition has to stay above the
// `mod search;` declaration for the child module to see it. No `#[macro_use]` is needed:
// that attribute only has an effect on modules and `extern crate` items.
macro_rules! other {
    () => {
        ParseError {
            kind: ParseErrorKind::Other,
        }
    };
}
pub(crate) mod search;
fn yt_initial_data(html: &str) -> Result<Value, Error> {
lazy_static! {
// FIXME: improve against accidental termination
static ref RE: Regex = Regex::new(r"var ytInitialData = (.*?);</script>").expect("valid regex");
}
serde_json::from_str::<Value>(
RE.captures(html)
.and_then(|c| c.get(1))
.map(|m| m.as_str())
.ok_or(ParseError {
kind: ParseErrorKind::Regex,
})?,
)
.map_err(|_| {
ParseError {
kind: ParseErrorKind::Regex,
}
.into()
})
}
fn yt_initial_player_response(html: &str) -> Result<Value, Error> {
lazy_static! {
// FIXME: improve against accidental termination
static ref RE: Regex = Regex::new(r"var ytInitialPlayerResponse = (.*);</script>").expect("valid regex");
}
serde_json::from_str::<Value>(
RE.captures(html)
.and_then(|c| c.get(1))
.map(|m| m.as_str())
.ok_or(ParseError {
kind: ParseErrorKind::Regex,
})?,
)
.map_err(|_| {
ParseError {
kind: ParseErrorKind::Regex,
}
.into()
})
}
/// Builds a [`Video`] from the html of a video page.
///
/// Id, title and uploader are read from the `ytInitialData` JSON, the length from the
/// `ytInitialPlayerResponse` JSON. Any missing or mistyped field yields a parse error.
pub(crate) fn video_information(html: &str) -> Result<Video, Error> {
    let data = yt_initial_data(html)?;
    let player = yt_initial_player_response(html)?;

    // The primary info (title) and secondary info (owner) renderers live in this list.
    let items = data
        .pointer("/contents/twoColumnWatchNextResults/results/results/contents")
        .ok_or(other!())?;

    let id = data
        .pointer("/currentVideoEndpoint/watchEndpoint/videoId")
        .and_then(Value::as_str)
        .map(VideoId::new)
        .ok_or(other!())?;

    // The duration is a string encoded number of milliseconds.
    let length = player
        .pointer("/streamingData/formats/0/approxDurationMs")
        .and_then(Value::as_str)
        .and_then(|ms| ms.parse::<u64>().ok())
        .map(Duration::from_millis)
        .ok_or(other!())?;

    let title = items
        .pointer("/0/videoPrimaryInfoRenderer/title/runs/0/text")
        .and_then(Value::as_str)
        .map(str::to_owned)
        .ok_or(other!())?;

    let owner = items
        .pointer("/1/videoSecondaryInfoRenderer/owner/videoOwnerRenderer/title/runs/0")
        .ok_or(other!())?;
    let channel_id = owner
        .pointer("/navigationEndpoint/browseEndpoint/browseId")
        .and_then(Value::as_str)
        .map(ChannelId::new)
        .ok_or(other!())?;
    let channel_name = owner
        .get("text")
        .and_then(Value::as_str)
        .map(str::to_owned)
        .ok_or(other!())?;

    Ok(Video {
        id,
        length,
        title,
        uploader: Channel {
            id: channel_id,
            name: channel_name,
        },
    })
}
fn video_is_live(video: &Value) -> bool {
let badges = video.get("badges").and_then(Value::as_array);
badges
.map(|badges| {
badges.iter().any(|badge| {
badge
.pointer("/metadataBadgeRenderer/style")
.and_then(Value::as_str)
.eq(&Some("BADGE_STYLE_TYPE_LIVE_NOW"))
})
})
.unwrap_or_default()
} }
/// Tries to select a given attribute fn length_to_dur(input: &str) -> Duration {
fn try_select_attribute<'a>( fn time_multiplier(i: usize) -> u64 {
document: &'a Html, match i {
selector: &Selector, 0 => 1,
attribute: &str, 1 => 60,
) -> Result<&'a str> { 2 => 3600,
let element = try_select_one(document, selector)?; _ => unreachable!("YouTube duration's aren't counted in days"),
element }
.value() }
.attr(attribute) let mut duration = Duration::ZERO;
.ok_or_else(|| Parsing::MissingAttribute(attribute.to_string()).into()) for (i, time) in input
.split(':')
.map(|s| s.parse::<u64>().expect("is str encoded integer"))
.rev()
.enumerate()
{
duration += Duration::from_secs(time * time_multiplier(i))
}
duration
} }

@ -0,0 +1,188 @@
use serde_json::Value;
use tracing::{event, instrument, Level};
use crate::{
error::{Error, ParseError, ParseErrorKind},
model::{
id::{ChannelId, PlaylistId, VideoId},
search::{PartialPlaylist, PartialPlaylistVideo, SearchItem, SearchResult},
Channel, Video,
},
};
use super::{length_to_dur, video_is_live, yt_initial_data};
/// Builds a [`SearchResult`] from the html of a search result page.
///
/// Video and playlist renderers are converted to [`SearchItem`]s; every other renderer is
/// ignored, as are entries the item parsers decide to skip.
pub(crate) fn search_information(html: &str) -> Result<SearchResult, Error> {
    let data = yt_initial_data(html)?;
    let entries = data
        .pointer("/contents/twoColumnSearchResultsRenderer/primaryContents/sectionListRenderer/contents/0/itemSectionRenderer/contents")
        .and_then(Value::as_array)
        .ok_or(other!())?;

    let mut things = Vec::with_capacity(entries.len());
    for entry in entries {
        let parsed = if let Some(video) = entry.get("videoRenderer") {
            parse_video(video)?.map(SearchItem::Video)
        } else if let Some(playlist) = entry.get("playlistRenderer") {
            parse_playlist(playlist)?.map(SearchItem::Playlist)
        } else {
            // TODO: "shelfRender" & "radioRender"
            None
        };
        // `None` means the entry was skipped, so nothing is added for it.
        things.extend(parsed);
    }

    Ok(SearchResult { items: things })
}
/// Converts a `videoRenderer` JSON object of a search page into a [`Video`].
///
/// Returns `Ok(None)` for entries that are skipped: live streams and videos without a length.
#[instrument(skip(video), fields(id))]
fn parse_video(video: &Value) -> Result<Option<Video>, Error> {
    let id = video
        .get("videoId")
        .and_then(Value::as_str)
        .map(VideoId::new)
        .ok_or(other!())?;
    tracing::Span::current().record("id", &id.as_str());

    // skip live (for now)
    if video_is_live(video) {
        event!(Level::TRACE, "skipping live stream");
        return Ok(None);
    }

    let length_text = video
        .pointer("/lengthText/simpleText")
        .and_then(Value::as_str);
    let length = if let Some(text) = length_text {
        length_to_dur(text)
    } else {
        // Live badge is sometimes missing for no reason
        event!(
            Level::DEBUG,
            "video without length (livestream?) found, skipping..."
        );
        return Ok(None);
    };

    let title = video
        .pointer("/title/runs/0/text")
        .and_then(Value::as_str)
        .map(str::to_owned)
        .ok_or(other!())?;

    let owner = video.pointer("/ownerText/runs/0").ok_or(other!())?;
    let channel_id = owner
        .pointer("/navigationEndpoint/browseEndpoint/browseId")
        .and_then(Value::as_str)
        .map(ChannelId::new)
        .ok_or(other!())?;
    let channel_name = owner
        .get("text")
        .and_then(Value::as_str)
        .map(str::to_owned)
        .ok_or(other!())?;

    Ok(Some(Video {
        id,
        length,
        title,
        uploader: Channel {
            id: channel_id,
            name: channel_name,
        },
    }))
}
/// Converts a `playlistRenderer` JSON object of a search page into a [`PartialPlaylist`].
///
/// Currently always returns `Ok(Some(_))` on success. Any missing or mistyped field,
/// including a track title, yields a parse error instead of a panic.
#[instrument(skip(playlist), fields(id))]
fn parse_playlist(playlist: &Value) -> Result<Option<PartialPlaylist>, Error> {
    let id = PlaylistId::new(
        playlist
            .get("playlistId")
            .and_then(Value::as_str)
            .ok_or(other!())?,
    );
    tracing::Span::current().record("id", &id.as_str());

    let items = playlist
        .get("videos")
        .and_then(Value::as_array)
        .ok_or(other!())?;
    let mut tracks = Vec::with_capacity(items.len());
    for track in items {
        let video = track.get("childVideoRenderer").ok_or(other!())?;
        let video_id = VideoId::new(
            video
                .get("videoId")
                .and_then(Value::as_str)
                .ok_or(other!())?,
        );
        let length = length_to_dur(
            video
                .pointer("/lengthText/simpleText")
                .and_then(Value::as_str)
                .ok_or(other!())?,
        );
        // A missing title is reported like every other missing field rather than panicking.
        let title = video
            .pointer("/title/simpleText")
            .and_then(Value::as_str)
            .ok_or(other!())?
            .to_owned();
        tracks.push(PartialPlaylistVideo {
            id: video_id,
            length,
            title,
        });
    }

    let tracks_total = playlist
        .pointer("/videoCountText/runs/0/text")
        .and_then(Value::as_str)
        .ok_or(other!())?
        .parse::<u32>()
        .map_err(|_| other!())?;
    let title = playlist
        .pointer("/title/simpleText")
        .and_then(Value::as_str)
        .ok_or(other!())?
        .to_owned();

    let uploader = playlist
        .pointer("/shortBylineText/runs/0")
        .ok_or(other!())?;
    let uploader = Channel {
        id: ChannelId::new(
            uploader
                .pointer("/navigationEndpoint/browseEndpoint/browseId")
                .and_then(Value::as_str)
                .ok_or(other!())?,
        ),
        name: uploader
            .get("text")
            .and_then(Value::as_str)
            .ok_or(other!())?
            .to_owned(),
    };

    Ok(Some(PartialPlaylist {
        id,
        tracks,
        tracks_total,
        title,
        uploader,
    }))
}

@ -1,32 +0,0 @@
use lazy_static::lazy_static;
use scraper::{Html, Selector};
use super::try_select_attribute;
use crate::{error::Result, types::VideoInformation};
// CSS selectors for the elements the video metadata is read from.
// The `unwrap`s only guard the constant selector strings below.
lazy_static! {
// `og:title` meta element, holds the video title.
static ref TITLE_SELECTOR: Selector = Selector::parse(r#"meta[property="og:title"]"#).unwrap();
// `og:image` meta element, holds the thumbnail url.
static ref THUMBNAIL_SELECTOR: Selector = Selector::parse(r#"meta[property="og:image"]"#).unwrap();
// Canonical link element, holds the video url.
static ref URL_SELECTOR: Selector = Selector::parse(r#"link[rel="canonical"]"#).unwrap();
// `name` itemprop link element, holds the uploader's name.
static ref CHANNEL_SELECTOR: Selector = Selector::parse(r#"link[itemprop="name"]"#).unwrap();
// `videoId` itemprop meta element, holds the video id.
static ref ID_SELECTOR: Selector = Selector::parse(r#"meta[itemprop="videoId"]"#).unwrap();
}
/// Parses information about a video from the html
///
/// Id, url, uploader and title are required and their absence is an error; the thumbnail is
/// optional and becomes `None` when it cannot be selected.
pub fn parse_video_information(html: &str) -> Result<VideoInformation> {
    let document = Html::parse_document(html);

    let id = try_select_attribute(&document, &ID_SELECTOR, "content")?.to_owned();
    let url = try_select_attribute(&document, &URL_SELECTOR, "href")?.to_owned();
    let uploader = try_select_attribute(&document, &CHANNEL_SELECTOR, "content")?.to_owned();
    let title = try_select_attribute(&document, &TITLE_SELECTOR, "content")?.to_owned();
    let thumbnail = try_select_attribute(&document, &THUMBNAIL_SELECTOR, "content")
        .ok()
        .map(str::to_owned);

    Ok(VideoInformation {
        id,
        url,
        title,
        uploader,
        thumbnail,
    })
}

@ -1 +0,0 @@
mod endpoints;

@ -1,10 +0,0 @@
use crate::get_video_information;
// Network test: fetching the watch url below must result in an error.
// NOTE(review): presumably no video with the id `FFFFFFFFFFF` exists - confirm.
#[tokio::test]
async fn invalid_url_is_err() {
assert!(
get_video_information("https://www.youtube.com/watch?v=FFFFFFFFFFF")
.await
.is_err()
);
}

@ -1,8 +0,0 @@
/// Metadata about a single video.
#[derive(Clone, Debug)]
pub struct VideoInformation {
/// The video's id.
pub id: String,
/// The video's url.
pub url: String,
/// The video's title.
pub title: String,
/// The name of the video's uploader.
pub uploader: String,
/// Url of the video's thumbnail, if one is available.
pub thumbnail: Option<String>,
}
Loading…
Cancel
Save