Initial commit
This commit is contained in:
311
src/lib.rs
Normal file
311
src/lib.rs
Normal file
@@ -0,0 +1,311 @@
|
||||
use mediawiki_rest_api::prelude::*;
|
||||
use mediawiki_rest_api::rest_api_builder::RestApiBuilder;
|
||||
use parse_wiki_text::{Configuration, Node, Output};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Fetches the raw wikitext of a Wikipedia article.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `language` - The language code (e.g., "en", "fr").
|
||||
/// * `title` - The title of the article.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The raw wikitext content of the article.
|
||||
pub async fn fetch_article(language: &str, title: &str) -> Result<String, String> {
|
||||
let api = RestApiBuilder::wikipedia(language).build();
|
||||
let page = Page::new(title);
|
||||
match page.get(&api, false).await {
|
||||
Ok((_, wikitext)) => Ok(wikitext),
|
||||
Err(e) => Err(format!("Failed to fetch article: {}", e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses the wikitext into a structured format.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `text` - The raw wikitext.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The parsed output containing nodes.
|
||||
pub fn parse(text: &str) -> Output {
|
||||
let config = Configuration::default();
|
||||
config.parse(text)
|
||||
}
|
||||
|
||||
/// Extracts all interwiki links from the parsed nodes.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `nodes` - The list of nodes to traverse.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A list of link targets (e.g., "Page Title").
|
||||
pub fn get_links(nodes: &[Node]) -> Vec<String> {
|
||||
let mut links = Vec::new();
|
||||
visit_nodes(nodes, &mut |node| {
|
||||
if let Node::Link { target, .. } = node {
|
||||
links.push(target.to_string());
|
||||
}
|
||||
});
|
||||
links
|
||||
}
|
||||
|
||||
/// Extracts all categories from the parsed nodes.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `nodes` - The list of nodes to traverse.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A list of category targets (e.g., "Category:Rust").
|
||||
pub fn get_categories(nodes: &[Node]) -> Vec<String> {
|
||||
let mut categories = Vec::new();
|
||||
visit_nodes(nodes, &mut |node| {
|
||||
if let Node::Category { target, .. } = node {
|
||||
categories.push(target.to_string());
|
||||
}
|
||||
});
|
||||
categories
|
||||
}
|
||||
|
||||
/// Extracts all references from the parsed nodes.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `nodes` - The list of nodes to traverse.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A list of reference contents (e.g., the text inside <ref> tags).
|
||||
pub fn get_references(nodes: &[Node]) -> Vec<String> {
|
||||
let mut references = Vec::new();
|
||||
visit_nodes(nodes, &mut |node| {
|
||||
if let Node::Tag { name, nodes, .. } = node {
|
||||
if name == "ref" {
|
||||
let content = get_text_from_nodes(nodes);
|
||||
if !content.is_empty() {
|
||||
references.push(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
references
|
||||
}
|
||||
|
||||
/// Extracts all templates from the parsed nodes.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `nodes` - The list of nodes to traverse.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A list of template names (e.g., "Infobox").
|
||||
pub fn get_templates(nodes: &[Node]) -> Vec<String> {
|
||||
let mut templates = Vec::new();
|
||||
visit_nodes(nodes, &mut |node| {
|
||||
if let Node::Template { name, .. } = node {
|
||||
let template_name = get_text_from_nodes(name).trim().to_string();
|
||||
if !template_name.is_empty() {
|
||||
templates.push(template_name);
|
||||
}
|
||||
}
|
||||
});
|
||||
templates
|
||||
}
|
||||
|
||||
/// Extracts parameters from a specific template.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `nodes` - The list of nodes to traverse.
|
||||
/// * `template_name` - The name of the template to find.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A map of parameter names to their values.
|
||||
pub fn get_template_parameters(nodes: &[Node], template_name: &str) -> HashMap<String, String> {
|
||||
let mut parameters = HashMap::new();
|
||||
visit_nodes(nodes, &mut |node| {
|
||||
if let Node::Template { name, parameters: params, .. } = node {
|
||||
let current_template_name = get_text_from_nodes(name).trim().to_string();
|
||||
if current_template_name.eq_ignore_ascii_case(template_name) {
|
||||
for param in params {
|
||||
let key = if let Some(name) = ¶m.name {
|
||||
get_text_from_nodes(name).trim().to_string()
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
let value = get_text_from_nodes(¶m.value).trim().to_string();
|
||||
parameters.insert(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
parameters
|
||||
}
|
||||
|
||||
fn visit_nodes<F>(nodes: &[Node], callback: &mut F)
|
||||
where
|
||||
F: FnMut(&Node),
|
||||
{
|
||||
for node in nodes {
|
||||
callback(node);
|
||||
match node {
|
||||
Node::Heading { nodes, .. } => visit_nodes(nodes, callback),
|
||||
Node::Link { text, .. } => visit_nodes(text, callback),
|
||||
Node::ExternalLink { nodes, .. } => visit_nodes(nodes, callback),
|
||||
Node::Preformatted { nodes, .. } => visit_nodes(nodes, callback),
|
||||
Node::Tag { nodes, .. } => visit_nodes(nodes, callback),
|
||||
Node::Image { text, .. } => visit_nodes(text, callback),
|
||||
Node::UnorderedList { items, .. } => {
|
||||
for item in items {
|
||||
visit_nodes(&item.nodes, callback);
|
||||
}
|
||||
}
|
||||
Node::OrderedList { items, .. } => {
|
||||
for item in items {
|
||||
visit_nodes(&item.nodes, callback);
|
||||
}
|
||||
}
|
||||
Node::DefinitionList { items, .. } => {
|
||||
for item in items {
|
||||
visit_nodes(&item.nodes, callback);
|
||||
}
|
||||
}
|
||||
Node::Template { name, parameters, .. } => {
|
||||
visit_nodes(name, callback);
|
||||
for param in parameters {
|
||||
if let Some(name) = ¶m.name {
|
||||
visit_nodes(name, callback);
|
||||
}
|
||||
visit_nodes(¶m.value, callback);
|
||||
}
|
||||
}
|
||||
Node::Parameter { name, default, .. } => {
|
||||
visit_nodes(name, callback);
|
||||
if let Some(default_val) = default {
|
||||
visit_nodes(default_val, callback);
|
||||
}
|
||||
}
|
||||
Node::Table { rows, captions, .. } => {
|
||||
for caption in captions {
|
||||
visit_nodes(&caption.content, callback);
|
||||
}
|
||||
for row in rows {
|
||||
for cell in &row.cells {
|
||||
visit_nodes(&cell.content, callback);
|
||||
}
|
||||
}
|
||||
}
|
||||
Node::Category { ordinal, .. } => {
|
||||
visit_nodes(ordinal, callback);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_text_from_nodes(nodes: &[Node]) -> String {
|
||||
let mut text = String::new();
|
||||
for node in nodes {
|
||||
match node {
|
||||
Node::Text { value, .. } => text.push_str(value),
|
||||
Node::Link { text: link_text, .. } => text.push_str(&get_text_from_nodes(link_text)),
|
||||
Node::ExternalLink { nodes, .. } => text.push_str(&get_text_from_nodes(nodes)),
|
||||
Node::Bold { .. } => {},
|
||||
Node::Italic { .. } => {},
|
||||
Node::BoldItalic { .. } => {},
|
||||
Node::CharacterEntity { character, .. } => text.push(*character),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
text
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): the two `fetch_article` tests call the real API built by
    // `RestApiBuilder::wikipedia`, so they presumably need network access.

    /// Fetching an existing article succeeds and yields non-empty content.
    #[tokio::test]
    async fn test_fetch_article_valid() {
        let fetched = fetch_article("en", "Rust (programming language)").await;
        assert!(fetched.is_ok());
        assert!(!fetched.unwrap().is_empty());
    }

    /// Fetching a title that should not exist reports an error.
    #[tokio::test]
    async fn test_fetch_article_invalid() {
        let fetched = fetch_article("en", "ThisPageDoesNotExist_12345_XYZ").await;
        assert!(fetched.is_err());
    }

    /// Parsing a heading produces at least one node.
    #[test]
    fn test_parse() {
        let parsed = parse("== Heading ==");
        assert!(!parsed.nodes.is_empty());
    }

    /// Plain and piped links are both reported by their target, in order.
    #[test]
    fn test_get_links() {
        let parsed = parse("Here is a [[Link]] and another [[Another Link|with text]].");
        let found = get_links(&parsed.nodes);
        assert_eq!(found, vec!["Link", "Another Link"]);
    }

    /// Links inside list items are found too.
    #[test]
    fn test_get_links_nested() {
        let parsed = parse("* [[List Link]]\n* Item with [[Nested Link]]");
        let found = get_links(&parsed.nodes);
        for expected in ["List Link", "Nested Link"] {
            assert!(found.iter().any(|link| link == expected));
        }
    }

    /// Category links are reported with their full target.
    #[test]
    fn test_get_categories() {
        let parsed = parse("Some text. [[Category:Programming languages]] [[Category:Rust]]");
        let found = get_categories(&parsed.nodes);
        for expected in ["Category:Programming languages", "Category:Rust"] {
            assert!(found.iter().any(|category| category == expected));
        }
    }

    /// The text inside each `<ref>` tag is extracted.
    #[test]
    fn test_get_references() {
        let parsed =
            parse("Some statement.<ref>Source 1</ref> Another statement.<ref>Source 2</ref>");
        let found = get_references(&parsed.nodes);
        assert_eq!(found.len(), 2);
        for expected in ["Source 1", "Source 2"] {
            assert!(found.iter().any(|reference| reference == expected));
        }
    }

    /// Template names are reported trimmed, with and without parameters.
    #[test]
    fn test_get_templates() {
        let parsed =
            parse("{{Infobox person\n| name = Example\n}}\nSome text. {{Another template}}");
        let found = get_templates(&parsed.nodes);
        assert_eq!(found.len(), 2);
        for expected in ["Infobox person", "Another template"] {
            assert!(found.iter().any(|template| template == expected));
        }
    }

    /// Named parameters of the requested template are returned trimmed.
    #[test]
    fn test_get_template_parameters() {
        let parsed = parse("{{Infobox person\n| name = Example Name\n| age = 30\n}}");
        let found = get_template_parameters(&parsed.nodes, "Infobox person");
        assert_eq!(found.get("name").map(String::as_str), Some("Example Name"));
        assert_eq!(found.get("age").map(String::as_str), Some("30"));
    }
}
|
||||
181
src/main.rs
Normal file
181
src/main.rs
Normal file
@@ -0,0 +1,181 @@
|
||||
use Brig::{fetch_article, parse, get_links, get_categories, get_references, get_templates, get_template_parameters};
|
||||
use parse_wiki_text::Node;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
// Example usage of the library
|
||||
let language = "en";
|
||||
let title = "Rust (programming language)";
|
||||
|
||||
println!("Fetching article '{}' from {} Wikipedia...", title, language);
|
||||
|
||||
match fetch_article(language, title).await {
|
||||
Ok(wikitext) => {
|
||||
println!("Successfully fetched article. Length: {}", wikitext.len());
|
||||
|
||||
let output = parse(&wikitext);
|
||||
println!("Parsed article. Nodes: {}", output.nodes.len());
|
||||
|
||||
let links = get_links(&output.nodes);
|
||||
println!("Found {} links:", links.len());
|
||||
for (i, link) in links.iter().take(10).enumerate() {
|
||||
println!(" {}. {}", i + 1, link);
|
||||
}
|
||||
if links.len() > 10 {
|
||||
println!(" ... and {} more", links.len() - 10);
|
||||
}
|
||||
|
||||
let categories = get_categories(&output.nodes);
|
||||
println!("Found {} categories:", categories.len());
|
||||
for (i, category) in categories.iter().enumerate() {
|
||||
println!(" {}. {}", i + 1, category);
|
||||
}
|
||||
|
||||
let references = get_references(&output.nodes);
|
||||
println!("Found {} references:", references.len());
|
||||
for (i, reference) in references.iter().take(5).enumerate() {
|
||||
println!(" {}. {}", i + 1, reference);
|
||||
}
|
||||
if references.len() > 5 {
|
||||
println!(" ... and {} more", references.len() - 5);
|
||||
}
|
||||
|
||||
let templates = get_templates(&output.nodes);
|
||||
println!("Found {} templates:", templates.len());
|
||||
for (i, template) in templates.iter().take(10).enumerate() {
|
||||
println!(" {}. {}", i + 1, template);
|
||||
}
|
||||
if templates.len() > 10 {
|
||||
println!(" ... and {} more", templates.len() - 10);
|
||||
}
|
||||
|
||||
let template_name_to_find = "Infobox programming language";
|
||||
let template_params = get_template_parameters(&output.nodes, template_name_to_find);
|
||||
println!("Found {} parameters for template '{}':", template_params.len(), template_name_to_find);
|
||||
for (key, value) in template_params.iter().take(50) {
|
||||
println!(" - {}: {}", key, value);
|
||||
}
|
||||
if template_params.len() > 50 {
|
||||
println!(" ... and {} more", template_params.len() - 10);
|
||||
}
|
||||
|
||||
// Visit and print nodes (demonstration)
|
||||
// visit(&output.nodes, 0);
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn visit(nodes: &[Node], depth: usize) {
|
||||
for node in nodes {
|
||||
let indent = " ".repeat(depth * 2);
|
||||
match node {
|
||||
Node::Heading { level, nodes, .. } => {
|
||||
println!("{}Heading (level {})", indent, level);
|
||||
visit(nodes, depth + 1);
|
||||
}
|
||||
Node::Text { value, .. } => {
|
||||
println!("{}Text: {:?}", indent, value);
|
||||
}
|
||||
Node::Link { target, text, .. } => {
|
||||
println!("{}Link: {}", indent, target);
|
||||
visit(text, depth + 1);
|
||||
}
|
||||
Node::ExternalLink { nodes, .. } => {
|
||||
println!("{}ExternalLink", indent);
|
||||
visit(nodes, depth + 1);
|
||||
}
|
||||
Node::UnorderedList { items, .. } => {
|
||||
println!("{}UnorderedList", indent);
|
||||
for item in items {
|
||||
visit(&item.nodes, depth + 1);
|
||||
}
|
||||
}
|
||||
Node::OrderedList { items, .. } => {
|
||||
println!("{}OrderedList", indent);
|
||||
for item in items {
|
||||
visit(&item.nodes, depth + 1);
|
||||
}
|
||||
}
|
||||
Node::DefinitionList { items, .. } => {
|
||||
println!("{}DefinitionList", indent);
|
||||
for item in items {
|
||||
visit(&item.nodes, depth + 1);
|
||||
}
|
||||
}
|
||||
Node::Image { target, text, .. } => {
|
||||
println!("{}Image: {}", indent, target);
|
||||
visit(text, depth + 1);
|
||||
}
|
||||
Node::Template { name, parameters, .. } => {
|
||||
println!("{}Template", indent);
|
||||
println!("{} Name:", indent);
|
||||
visit(name, depth + 2);
|
||||
for param in parameters {
|
||||
println!("{} Param", indent);
|
||||
if let Some(name) = ¶m.name {
|
||||
println!("{} Key:", indent);
|
||||
visit(name, depth + 3);
|
||||
}
|
||||
println!("{} Value:", indent);
|
||||
visit(¶m.value, depth + 3);
|
||||
}
|
||||
}
|
||||
Node::Parameter { name, default, .. } => {
|
||||
println!("{}Parameter Usage", indent);
|
||||
println!("{} Name:", indent);
|
||||
visit(name, depth + 2);
|
||||
if let Some(default_val) = default {
|
||||
println!("{} Default:", indent);
|
||||
visit(default_val, depth + 2);
|
||||
}
|
||||
}
|
||||
Node::Tag { name, nodes, .. } => {
|
||||
println!("{}Tag: <{}>", indent, name);
|
||||
visit(nodes, depth + 1);
|
||||
}
|
||||
Node::StartTag { name, .. } => {
|
||||
println!("{}StartTag: <{}>", indent, name);
|
||||
}
|
||||
Node::EndTag { name, .. } => {
|
||||
println!("{}EndTag: </{}>", indent, name);
|
||||
}
|
||||
Node::Preformatted { nodes, .. } => {
|
||||
println!("{}Preformatted", indent);
|
||||
visit(nodes, depth + 1);
|
||||
}
|
||||
Node::Table { rows, captions, .. } => {
|
||||
println!("{}Table", indent);
|
||||
for caption in captions {
|
||||
println!("{} Caption", indent);
|
||||
visit(&caption.content, depth + 2);
|
||||
}
|
||||
for row in rows {
|
||||
println!("{} Row", indent);
|
||||
for cell in &row.cells {
|
||||
println!("{} Cell", indent);
|
||||
visit(&cell.content, depth + 3);
|
||||
}
|
||||
}
|
||||
}
|
||||
Node::Bold { .. } => println!("{}Bold", indent),
|
||||
Node::Italic { .. } => println!("{}Italic", indent),
|
||||
Node::BoldItalic { .. } => println!("{}BoldItalic", indent),
|
||||
Node::ParagraphBreak { .. } => println!("{}ParagraphBreak", indent),
|
||||
Node::HorizontalDivider { .. } => println!("{}HorizontalDivider", indent),
|
||||
Node::Category { target, ordinal, .. } => {
|
||||
println!("{}Category: {}", indent, target);
|
||||
visit(ordinal, depth + 1);
|
||||
}
|
||||
Node::Redirect { target, .. } => {
|
||||
println!("{}Redirect: {}", indent, target);
|
||||
}
|
||||
Node::Comment { .. } => println!("{}Comment", indent),
|
||||
Node::CharacterEntity { character, .. } => println!("{}CharacterEntity: {}", indent, character),
|
||||
Node::MagicWord { .. } => println!("{}MagicWord", indent),
|
||||
// _ => println!("{}Unknown Node: {:?}", indent, node),
|
||||
}
|
||||
}
|
||||
}
|
||||
0
src/parser/mod.rs
Normal file
0
src/parser/mod.rs
Normal file
Reference in New Issue
Block a user