Initial commit

This commit is contained in:
2026-01-25 16:47:18 -05:00
parent 81b73dd362
commit e8ab5e452e
14 changed files with 2361 additions and 0 deletions

311
src/lib.rs Normal file
View File

@@ -0,0 +1,311 @@
use mediawiki_rest_api::prelude::*;
use mediawiki_rest_api::rest_api_builder::RestApiBuilder;
use parse_wiki_text::{Configuration, Node, Output};
use std::collections::HashMap;
/// Downloads the raw wikitext of a single Wikipedia article.
///
/// # Arguments
///
/// * `language` - Language code of the Wikipedia edition (e.g., "en", "fr").
/// * `title` - Title of the article to fetch.
///
/// # Returns
///
/// The wikitext of the article as an owned `String`.
///
/// # Errors
///
/// Returns `Err` with a message of the form `"Failed to fetch article: ..."`
/// whenever the underlying `Page::get` call fails.
pub async fn fetch_article(language: &str, title: &str) -> Result<String, String> {
    let api = RestApiBuilder::wikipedia(language).build();
    let page = Page::new(title);
    // NOTE(review): the `false` flag is forwarded to `Page::get` unchanged;
    // confirm its meaning against the mediawiki_rest_api documentation.
    let fetched = page.get(&api, false).await;
    // Only the second tuple element (the wikitext) is of interest here.
    fetched
        .map(|(_, wikitext)| wikitext)
        .map_err(|e| format!("Failed to fetch article: {}", e))
}
/// Runs the wikitext parser over `text` using the default configuration.
///
/// # Arguments
///
/// * `text` - The raw wikitext to parse.
///
/// # Returns
///
/// The parser `Output`, whose `nodes` field holds the parsed node list.
pub fn parse(text: &str) -> Output {
    Configuration::default().parse(text)
}
/// Collects the target of every `Node::Link` found in the node tree.
///
/// The tree is walked recursively, so links nested inside lists, headings,
/// templates, tables, etc. are included as well.
///
/// # Arguments
///
/// * `nodes` - The list of nodes to traverse.
///
/// # Returns
///
/// The link targets (e.g., "Page Title") in traversal order.
pub fn get_links(nodes: &[Node]) -> Vec<String> {
    let mut targets = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Link { target, .. } => targets.push(target.to_string()),
        _ => {}
    });
    targets
}
/// Collects the target of every `Node::Category` found in the node tree.
///
/// # Arguments
///
/// * `nodes` - The list of nodes to traverse.
///
/// # Returns
///
/// The category targets (e.g., "Category:Rust") in traversal order.
pub fn get_categories(nodes: &[Node]) -> Vec<String> {
    let mut found = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Category { target, .. } => found.push(target.to_string()),
        _ => {}
    });
    found
}
/// Collects the plain-text content of every `ref` tag in the node tree.
///
/// References whose extracted text is empty are skipped.
///
/// # Arguments
///
/// * `nodes` - The list of nodes to traverse.
///
/// # Returns
///
/// The reference contents (e.g., the text inside `<ref>` tags) in traversal
/// order.
pub fn get_references(nodes: &[Node]) -> Vec<String> {
    let mut collected = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Tag {
            name,
            nodes: children,
            ..
        } if name == "ref" => {
            let body = get_text_from_nodes(children);
            if !body.is_empty() {
                collected.push(body);
            }
        }
        _ => {}
    });
    collected
}
/// Collects the name of every template invocation in the node tree.
///
/// Names are taken from the template's name nodes, trimmed of surrounding
/// whitespace; templates whose trimmed name is empty are skipped.
///
/// # Arguments
///
/// * `nodes` - The list of nodes to traverse.
///
/// # Returns
///
/// The template names (e.g., "Infobox") in traversal order.
pub fn get_templates(nodes: &[Node]) -> Vec<String> {
    let mut names = Vec::new();
    visit_nodes(nodes, &mut |node| {
        if let Node::Template { name, .. } = node {
            let raw = get_text_from_nodes(name);
            let trimmed = raw.trim();
            if !trimmed.is_empty() {
                names.push(trimmed.to_string());
            }
        }
    });
    names
}
/// Gathers the named parameters of every template called `template_name`.
///
/// Template names are compared after trimming, ignoring ASCII case.
/// Parameters without a name (positional parameters) are skipped. When the
/// same key is seen more than once, the value encountered later in the
/// traversal replaces the earlier one.
///
/// # Arguments
///
/// * `nodes` - The list of nodes to traverse.
/// * `template_name` - The name of the template to find.
///
/// # Returns
///
/// A map from trimmed parameter name to trimmed parameter value.
pub fn get_template_parameters(nodes: &[Node], template_name: &str) -> HashMap<String, String> {
    let mut collected = HashMap::new();
    visit_nodes(nodes, &mut |node| {
        // Only template nodes are of interest.
        let (name_nodes, params) = match node {
            Node::Template {
                name, parameters, ..
            } => (name, parameters),
            _ => return,
        };
        let is_wanted = get_text_from_nodes(name_nodes)
            .trim()
            .eq_ignore_ascii_case(template_name);
        if !is_wanted {
            return;
        }
        for param in params {
            // Positional parameters carry no name and are ignored.
            if let Some(key_nodes) = &param.name {
                let key = get_text_from_nodes(key_nodes).trim().to_string();
                let value = get_text_from_nodes(&param.value).trim().to_string();
                collected.insert(key, value);
            }
        }
    });
    collected
}
/// Walks the node tree depth-first, invoking `callback` on every node.
///
/// Each node is passed to `callback` before its children are visited
/// (pre-order). Children are the nested node lists of headings, links,
/// external links, preformatted blocks, tags, images, list items, template
/// names and parameters, parameter names and defaults, table captions and
/// cells, and category ordinals. All other node kinds are treated as leaves.
fn visit_nodes<F>(nodes: &[Node], callback: &mut F)
where
    F: FnMut(&Node),
{
    for current in nodes {
        callback(current);
        match current {
            // Variants that wrap exactly one child list.
            Node::Heading { nodes: children, .. }
            | Node::ExternalLink { nodes: children, .. }
            | Node::Preformatted { nodes: children, .. }
            | Node::Tag { nodes: children, .. }
            | Node::Link { text: children, .. }
            | Node::Image { text: children, .. }
            | Node::Category { ordinal: children, .. } => visit_nodes(children, callback),
            Node::UnorderedList { items, .. } | Node::OrderedList { items, .. } => {
                for entry in items {
                    visit_nodes(&entry.nodes, callback);
                }
            }
            Node::DefinitionList { items, .. } => {
                for entry in items {
                    visit_nodes(&entry.nodes, callback);
                }
            }
            Node::Template {
                name, parameters, ..
            } => {
                visit_nodes(name, callback);
                for argument in parameters {
                    // The optional parameter name is visited before its value.
                    if let Some(key) = &argument.name {
                        visit_nodes(key, callback);
                    }
                    visit_nodes(&argument.value, callback);
                }
            }
            Node::Parameter { name, default, .. } => {
                visit_nodes(name, callback);
                if let Some(fallback) = default {
                    visit_nodes(fallback, callback);
                }
            }
            Node::Table { rows, captions, .. } => {
                // Captions are visited first, then cells row by row.
                for caption in captions {
                    visit_nodes(&caption.content, callback);
                }
                for cell in rows.iter().flat_map(|row| row.cells.iter()) {
                    visit_nodes(&cell.content, callback);
                }
            }
            _ => {}
        }
    }
}
/// Flattens a node list into plain text.
///
/// Text nodes and character entities contribute their content; links and
/// external links contribute the flattened text of their nested nodes. Every
/// other node kind (formatting markers, templates, tags, ...) contributes
/// nothing.
fn get_text_from_nodes(nodes: &[Node]) -> String {
    let mut buffer = String::new();
    append_plain_text(nodes, &mut buffer);
    buffer
}

/// Appends the plain text of `nodes` to `buffer`, recursing into link text.
fn append_plain_text(nodes: &[Node], buffer: &mut String) {
    for node in nodes {
        match node {
            Node::Text { value, .. } => buffer.push_str(value),
            Node::CharacterEntity { character, .. } => buffer.push(*character),
            Node::Link { text: inner, .. } | Node::ExternalLink { nodes: inner, .. } => {
                append_plain_text(inner, buffer)
            }
            _ => {}
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when `items` contains an entry equal to `wanted`.
    fn has(items: &[String], wanted: &str) -> bool {
        items.iter().any(|item| item == wanted)
    }

    // The two `fetch_article` tests call the real function with concrete
    // titles, so they depend on the remote service being reachable.
    #[tokio::test]
    async fn test_fetch_article_valid() {
        let content = fetch_article("en", "Rust (programming language)")
            .await
            .expect("fetching an existing article should succeed");
        assert!(!content.is_empty());
    }

    #[tokio::test]
    async fn test_fetch_article_invalid() {
        let outcome = fetch_article("en", "ThisPageDoesNotExist_12345_XYZ").await;
        assert!(outcome.is_err());
    }

    #[test]
    fn test_parse() {
        let parsed = parse("== Heading ==");
        assert!(!parsed.nodes.is_empty());
    }

    #[test]
    fn test_get_links() {
        let parsed = parse("Here is a [[Link]] and another [[Another Link|with text]].");
        assert_eq!(get_links(&parsed.nodes), vec!["Link", "Another Link"]);
    }

    #[test]
    fn test_get_links_nested() {
        let parsed = parse("* [[List Link]]\n* Item with [[Nested Link]]");
        let found = get_links(&parsed.nodes);
        assert!(has(&found, "List Link"));
        assert!(has(&found, "Nested Link"));
    }

    #[test]
    fn test_get_categories() {
        let parsed = parse("Some text. [[Category:Programming languages]] [[Category:Rust]]");
        let found = get_categories(&parsed.nodes);
        assert!(has(&found, "Category:Programming languages"));
        assert!(has(&found, "Category:Rust"));
    }

    #[test]
    fn test_get_references() {
        let parsed =
            parse("Some statement.<ref>Source 1</ref> Another statement.<ref>Source 2</ref>");
        let found = get_references(&parsed.nodes);
        assert_eq!(found.len(), 2);
        assert!(has(&found, "Source 1"));
        assert!(has(&found, "Source 2"));
    }

    #[test]
    fn test_get_templates() {
        let parsed =
            parse("{{Infobox person\n| name = Example\n}}\nSome text. {{Another template}}");
        let found = get_templates(&parsed.nodes);
        assert_eq!(found.len(), 2);
        assert!(has(&found, "Infobox person"));
        assert!(has(&found, "Another template"));
    }

    #[test]
    fn test_get_template_parameters() {
        let parsed = parse("{{Infobox person\n| name = Example Name\n| age = 30\n}}");
        let params = get_template_parameters(&parsed.nodes, "Infobox person");
        assert_eq!(
            params.get("name").map(String::as_str),
            Some("Example Name")
        );
        assert_eq!(params.get("age").map(String::as_str), Some("30"));
    }
}

181
src/main.rs Normal file
View File

@@ -0,0 +1,181 @@
use Brig::{fetch_article, parse, get_links, get_categories, get_references, get_templates, get_template_parameters};
use parse_wiki_text::Node;
#[tokio::main]
async fn main() {
    // Display limits. Each limit is named once so the "... and N more" line
    // can never disagree with the number of entries actually printed.
    const LINK_LIMIT: usize = 10;
    const REFERENCE_LIMIT: usize = 5;
    const TEMPLATE_LIMIT: usize = 10;
    const PARAMETER_LIMIT: usize = 50;

    // Example usage of the library
    let language = "en";
    let title = "Rust (programming language)";
    println!("Fetching article '{}' from {} Wikipedia...", title, language);

    let wikitext = match fetch_article(language, title).await {
        Ok(wikitext) => wikitext,
        Err(e) => {
            eprintln!("Error: {}", e);
            return;
        }
    };
    println!("Successfully fetched article. Length: {}", wikitext.len());

    let output = parse(&wikitext);
    println!("Parsed article. Nodes: {}", output.nodes.len());

    let links = get_links(&output.nodes);
    println!("Found {} links:", links.len());
    print_numbered(&links, LINK_LIMIT);

    // Categories are printed in full, without truncation.
    let categories = get_categories(&output.nodes);
    println!("Found {} categories:", categories.len());
    for (i, category) in categories.iter().enumerate() {
        println!(" {}. {}", i + 1, category);
    }

    let references = get_references(&output.nodes);
    println!("Found {} references:", references.len());
    print_numbered(&references, REFERENCE_LIMIT);

    let templates = get_templates(&output.nodes);
    println!("Found {} templates:", templates.len());
    print_numbered(&templates, TEMPLATE_LIMIT);

    let template_name_to_find = "Infobox programming language";
    let template_params = get_template_parameters(&output.nodes, template_name_to_find);
    println!(
        "Found {} parameters for template '{}':",
        template_params.len(),
        template_name_to_find
    );
    for (key, value) in template_params.iter().take(PARAMETER_LIMIT) {
        println!(" - {}: {}", key, value);
    }
    if template_params.len() > PARAMETER_LIMIT {
        // Subtract the same limit that was used for truncation above.
        println!(
            " ... and {} more",
            template_params.len() - PARAMETER_LIMIT
        );
    }

    // To dump the whole node tree for debugging, call
    // `visit(&output.nodes, 0)` here.
}

/// Prints at most `limit` entries of `items` as a 1-based numbered list,
/// followed by a "... and N more" line when entries were left out.
fn print_numbered(items: &[String], limit: usize) {
    for (i, item) in items.iter().take(limit).enumerate() {
        println!(" {}. {}", i + 1, item);
    }
    if items.len() > limit {
        println!(" ... and {} more", items.len() - limit);
    }
}
/// Prints an indented outline of the node tree to stdout (debugging aid).
///
/// Every node produces one line naming its kind, prefixed by `depth * 2`
/// repetitions of the indent unit; nested node lists are printed recursively
/// at a greater depth. The match is exhaustive over all `Node` variants.
fn visit(nodes: &[Node], depth: usize) {
    // The prefix depends only on `depth`, so it is built once per call.
    let pad = " ".repeat(depth * 2);
    for node in nodes {
        match node {
            // --- Leaf nodes: a single line, no recursion. ---
            Node::Text { value, .. } => println!("{}Text: {:?}", pad, value),
            Node::Bold { .. } => println!("{}Bold", pad),
            Node::Italic { .. } => println!("{}Italic", pad),
            Node::BoldItalic { .. } => println!("{}BoldItalic", pad),
            Node::ParagraphBreak { .. } => println!("{}ParagraphBreak", pad),
            Node::HorizontalDivider { .. } => println!("{}HorizontalDivider", pad),
            Node::Comment { .. } => println!("{}Comment", pad),
            Node::MagicWord { .. } => println!("{}MagicWord", pad),
            Node::CharacterEntity { character, .. } => {
                println!("{}CharacterEntity: {}", pad, character)
            }
            Node::Redirect { target, .. } => println!("{}Redirect: {}", pad, target),
            Node::StartTag { name, .. } => println!("{}StartTag: <{}>", pad, name),
            Node::EndTag { name, .. } => println!("{}EndTag: </{}>", pad, name),

            // --- Nodes wrapping a single child list. ---
            Node::Heading {
                level,
                nodes: children,
                ..
            } => {
                println!("{}Heading (level {})", pad, level);
                visit(children, depth + 1);
            }
            Node::Link { target, text, .. } => {
                println!("{}Link: {}", pad, target);
                visit(text, depth + 1);
            }
            Node::ExternalLink {
                nodes: children, ..
            } => {
                println!("{}ExternalLink", pad);
                visit(children, depth + 1);
            }
            Node::Image { target, text, .. } => {
                println!("{}Image: {}", pad, target);
                visit(text, depth + 1);
            }
            Node::Tag {
                name,
                nodes: children,
                ..
            } => {
                println!("{}Tag: <{}>", pad, name);
                visit(children, depth + 1);
            }
            Node::Preformatted {
                nodes: children, ..
            } => {
                println!("{}Preformatted", pad);
                visit(children, depth + 1);
            }
            Node::Category {
                target, ordinal, ..
            } => {
                println!("{}Category: {}", pad, target);
                visit(ordinal, depth + 1);
            }

            // --- Lists: each item's nodes are printed one level deeper. ---
            Node::UnorderedList { items, .. } => {
                println!("{}UnorderedList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }
            Node::OrderedList { items, .. } => {
                println!("{}OrderedList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }
            Node::DefinitionList { items, .. } => {
                println!("{}DefinitionList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }

            // --- Compound nodes with labelled sub-sections. ---
            Node::Template {
                name, parameters, ..
            } => {
                println!("{}Template", pad);
                println!("{} Name:", pad);
                visit(name, depth + 2);
                for argument in parameters {
                    println!("{} Param", pad);
                    if let Some(key) = &argument.name {
                        println!("{} Key:", pad);
                        visit(key, depth + 3);
                    }
                    println!("{} Value:", pad);
                    visit(&argument.value, depth + 3);
                }
            }
            Node::Parameter { name, default, .. } => {
                println!("{}Parameter Usage", pad);
                println!("{} Name:", pad);
                visit(name, depth + 2);
                if let Some(fallback) = default {
                    println!("{} Default:", pad);
                    visit(fallback, depth + 2);
                }
            }
            Node::Table { rows, captions, .. } => {
                println!("{}Table", pad);
                for caption in captions {
                    println!("{} Caption", pad);
                    visit(&caption.content, depth + 2);
                }
                for row in rows {
                    println!("{} Row", pad);
                    for cell in &row.cells {
                        println!("{} Cell", pad);
                        visit(&cell.content, depth + 3);
                    }
                }
            }
        }
    }
}

0
src/parser/mod.rs Normal file
View File