Initial commit

This commit is contained in:
2026-01-25 16:47:18 -05:00
parent 81b73dd362
commit e8ab5e452e
14 changed files with 2361 additions and 0 deletions

5
.gitignore vendored
View File

@@ -16,3 +16,8 @@ target/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Added by cargo
/target

10
.idea/.gitignore generated vendored Normal file
View File

@@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

6
.idea/misc.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>

8
.idea/modules.xml generated Normal file
View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/Brig.iml" filepath="$PROJECT_DIR$/Brig.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

13
Brig.iml Normal file
View File

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="true">
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/tests" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

1641
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

11
Cargo.toml Normal file
View File

@@ -0,0 +1,11 @@
[package]
name = "Brig"
version = "0.1.0"
edition = "2024"
[dependencies]
mediawiki_rest_api = "0.2.1"
parse_wiki_text = "0.1.5"
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros"] }

36
GEMINI.md Normal file
View File

@@ -0,0 +1,36 @@
## Critical Rules
### 1. Code Organization
- Many small files over few large files
- High cohesion, low coupling
- 200-400 lines typical, 800 max per file
- Organize by feature/domain, not by type
### 2. Code Style
- No emojis in code, comments, or documentation
- Immutability always - never mutate objects or arrays
- No console.log in production code
- Proper error handling with try/catch
- Input validation with Zod or similar
- Always follow existing coding style when adding new code
- Avoid nested pattern matching as much as possible
### 3. Regarding Dependencies:
- Avoid introducing new external dependencies unless absolutely necessary.
- If a new dependency is required, state the reason.
### 4. Adding tests
When adding tests for a particular feature, add the tests near where other tests
for similar code live. Try not to add new dependencies as you add tests, and try
to make new tests similar in style and API usage to other tests which already
exist nearby.
### 5. AI/Developer Rules
- **Rust Development Verification:** After modifying any Rust code (`.rs` files or `Cargo.toml`), you **MUST** run `cargo build` from the repository root (where `Cargo.toml` lives) to verify compilation.
- If the build fails, you **MUST** analyze the error log and apply a fix immediately.
- Do **NOT** report the task as complete until the code compiles successfully.

34
spec.md Normal file
View File

@@ -0,0 +1,34 @@
# Spec: Wikimedia article manipulation library
**Goal**: Create a Rust library providing high-level and low-level functions for manipulating the properties
and the content of Wikimedia pages. This library must be built on top of the two existing Rust libraries
mediawiki_rest_api and parse_wiki_text to implement this functionality.
**User Story 1**: As a user, I want to get the raw text content of a wikimedia article by specifying the site, language and article name.
**User Story 2**: As a user, I want to get the list of all the links of an article
**User Story 3**: As a user, I want to get the list of all the categories of an article
**User Story 4**: As a user, I want to get the list of all the references of an article
**User Story 5**: As a user, I want to get the list of all the templates of an article
**User Story 6**: As a user, I want to get the list of all the parameters of a template
**Functional Requirements**:
1. Use mediawiki_rest_api to get access to the wikimedia article
2. Use mediawiki_rest_api to get the initial content of the article
3. Use parse_wiki_text to parse the wikimedia article
4. Use parse_wiki_text to interact with any internal content of the article
5. Use parse_wiki_text nodes
6. Always write unit tests for every function
**Technical Constraints**:
- rust language
- mediawiki_rest_api
- parse_wiki_text
**Test Cases**:
- **Scenario A**: Valid access -> Status 200 + content of the article.
- **Scenario B**: Valid access -> Status 200 + categories of the article.
- **Scenario C**: Valid access -> Status 200 + links of the article.
- **Scenario D**: Valid access -> Status 200 + references of the article.
- **Scenario E**: Valid access -> Status 200 + templates of the article.
- **Scenario F**: Valid access -> Status 200 + parameters of a template.
- **Scenario Z**: Incorrect article -> Status 404 + error message.

311
src/lib.rs Normal file
View File

@@ -0,0 +1,311 @@
use mediawiki_rest_api::prelude::*;
use mediawiki_rest_api::rest_api_builder::RestApiBuilder;
use parse_wiki_text::{Configuration, Node, Output};
use std::collections::HashMap;
/// Downloads the wikitext source of a single Wikipedia article.
///
/// # Arguments
///
/// * `language` - Wikipedia language code, for example `"en"` or `"fr"`.
/// * `title` - Title of the article to retrieve.
///
/// # Returns
///
/// The wikitext of the article on success.
///
/// # Errors
///
/// Returns an `Err` whose message starts with `"Failed to fetch article: "`
/// followed by the underlying error when the page request fails.
pub async fn fetch_article(language: &str, title: &str) -> Result<String, String> {
    let client = RestApiBuilder::wikipedia(language).build();
    let page = Page::new(title);
    page.get(&client, false)
        .await
        // Only the wikitext half of the returned pair is of interest here.
        .map(|(_, wikitext)| wikitext)
        .map_err(|err| format!("Failed to fetch article: {}", err))
}
/// Turns raw wikitext into a tree of `parse_wiki_text` nodes.
///
/// The parser is run with `Configuration::default()`.
///
/// # Arguments
///
/// * `text` - The raw wikitext to parse.
///
/// # Returns
///
/// The parser output; its `nodes` field holds the top-level nodes.
pub fn parse(text: &str) -> Output {
    Configuration::default().parse(text)
}
/// Collects the target of every wiki link (`[[Target]]` or `[[Target|label]]`)
/// found anywhere in the node tree.
///
/// # Arguments
///
/// * `nodes` - The nodes to traverse (children are visited recursively).
///
/// # Returns
///
/// The link targets in traversal order, e.g. `"Page Title"`. Duplicates are kept.
pub fn get_links(nodes: &[Node]) -> Vec<String> {
    let mut targets = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Link { target, .. } => targets.push(target.to_string()),
        _ => {}
    });
    targets
}
/// Collects the target of every category node found anywhere in the node tree.
///
/// # Arguments
///
/// * `nodes` - The nodes to traverse (children are visited recursively).
///
/// # Returns
///
/// The category targets in traversal order, e.g. `"Category:Rust"`.
pub fn get_categories(nodes: &[Node]) -> Vec<String> {
    let mut found = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Category { target, .. } => found.push(target.to_string()),
        _ => {}
    });
    found
}
/// Collects the textual content of every `<ref>` tag in the node tree.
///
/// # Arguments
///
/// * `nodes` - The nodes to traverse (children are visited recursively).
///
/// # Returns
///
/// One string per `<ref>` tag whose extracted text is non-empty, in traversal
/// order. References that yield no text are skipped.
pub fn get_references(nodes: &[Node]) -> Vec<String> {
    let mut collected = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Tag { name, nodes: body, .. } if name == "ref" => {
            let content = get_text_from_nodes(body);
            if !content.is_empty() {
                collected.push(content);
            }
        }
        _ => {}
    });
    collected
}
/// Collects the name of every template invocation in the node tree.
///
/// # Arguments
///
/// * `nodes` - The nodes to traverse (children are visited recursively).
///
/// # Returns
///
/// The whitespace-trimmed template names in traversal order, e.g.
/// `"Infobox person"`. Templates whose name yields no text are skipped.
pub fn get_templates(nodes: &[Node]) -> Vec<String> {
    let mut names = Vec::new();
    visit_nodes(nodes, &mut |node| match node {
        Node::Template { name, .. } => {
            let raw = get_text_from_nodes(name);
            let trimmed = raw.trim();
            if !trimmed.is_empty() {
                names.push(trimmed.to_string());
            }
        }
        _ => {}
    });
    names
}
/// Extracts the parameters of every invocation of a given template.
///
/// Template names are compared after trimming the name found in the article,
/// ignoring ASCII case. When the template occurs several times, the parameters
/// of all occurrences are merged into one map and later values overwrite
/// earlier ones that share a key.
///
/// # Arguments
///
/// * `nodes` - The nodes to traverse (children are visited recursively).
/// * `template_name` - The name of the template to look for.
///
/// # Returns
///
/// A map from parameter name to parameter value, both whitespace-trimmed.
/// Unnamed (positional) parameters are stored under their 1-based position
/// among the unnamed parameters of the invocation (`"1"`, `"2"`, ...), which
/// mirrors how MediaWiki names them implicitly.
pub fn get_template_parameters(nodes: &[Node], template_name: &str) -> HashMap<String, String> {
    let mut parameters = HashMap::new();
    visit_nodes(nodes, &mut |node| {
        let Node::Template { name, parameters: params, .. } = node else {
            return;
        };
        if !get_text_from_nodes(name).trim().eq_ignore_ascii_case(template_name) {
            return;
        }
        // Counts only unnamed parameters, restarting for every invocation.
        let mut positional_index = 0usize;
        for param in params {
            let key = match &param.name {
                Some(name_nodes) => get_text_from_nodes(name_nodes).trim().to_string(),
                None => {
                    positional_index += 1;
                    positional_index.to_string()
                }
            };
            let value = get_text_from_nodes(&param.value).trim().to_string();
            parameters.insert(key, value);
        }
    });
    parameters
}
/// Walks the node tree depth-first, invoking `callback` on every node.
///
/// A node is passed to `callback` before any of its children. Children are
/// taken from the fields this function knows how to descend into (heading,
/// link and image text, external links, preformatted blocks, tags, list items,
/// template names and parameters, parameter names and defaults, table captions
/// and cells, category ordinals). All other node kinds are treated as leaves.
fn visit_nodes<F>(nodes: &[Node], callback: &mut F)
where
    F: FnMut(&Node),
{
    for node in nodes {
        callback(node);
        match node {
            // Containers that simply wrap a list of child nodes.
            Node::Heading { nodes: children, .. }
            | Node::ExternalLink { nodes: children, .. }
            | Node::Preformatted { nodes: children, .. }
            | Node::Tag { nodes: children, .. } => visit_nodes(children, callback),
            // Links and images keep their children in the `text` field.
            Node::Link { text: label, .. } | Node::Image { text: label, .. } => {
                visit_nodes(label, callback)
            }
            Node::UnorderedList { items, .. } | Node::OrderedList { items, .. } => {
                for item in items {
                    visit_nodes(&item.nodes, callback);
                }
            }
            Node::DefinitionList { items, .. } => {
                for item in items {
                    visit_nodes(&item.nodes, callback);
                }
            }
            Node::Template { name, parameters, .. } => {
                visit_nodes(name, callback);
                for param in parameters {
                    if let Some(key) = &param.name {
                        visit_nodes(key, callback);
                    }
                    visit_nodes(&param.value, callback);
                }
            }
            Node::Parameter { name, default, .. } => {
                visit_nodes(name, callback);
                if let Some(fallback) = default {
                    visit_nodes(fallback, callback);
                }
            }
            Node::Table { rows, captions, .. } => {
                // Captions are visited before the cells.
                for caption in captions {
                    visit_nodes(&caption.content, callback);
                }
                for cell in rows.iter().flat_map(|row| &row.cells) {
                    visit_nodes(&cell.content, callback);
                }
            }
            Node::Category { ordinal, .. } => visit_nodes(ordinal, callback),
            _ => {}
        }
    }
}
/// Concatenates the plain text contained in `nodes`.
///
/// Text nodes and character entities contribute directly; links and external
/// links contribute the text of their children. Every other node kind
/// (formatting markers, templates, tags, ...) contributes nothing.
fn get_text_from_nodes(nodes: &[Node]) -> String {
    // Appends into one shared buffer so nested links need no temporary strings.
    fn append_text(nodes: &[Node], out: &mut String) {
        for node in nodes {
            match node {
                Node::Text { value, .. } => out.push_str(value),
                Node::CharacterEntity { character, .. } => out.push(*character),
                Node::Link { text: inner, .. } | Node::ExternalLink { nodes: inner, .. } => {
                    append_text(inner, out)
                }
                _ => {}
            }
        }
    }

    let mut buffer = String::new();
    append_text(nodes, &mut buffer);
    buffer
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fetching an existing article yields non-empty wikitext (performs a real request).
    #[tokio::test]
    async fn test_fetch_article_valid() {
        let fetched = fetch_article("en", "Rust (programming language)").await;
        assert!(fetched.is_ok());
        assert!(!fetched.unwrap().is_empty());
    }

    // Fetching a page that does not exist reports an error (performs a real request).
    #[tokio::test]
    async fn test_fetch_article_invalid() {
        let fetched = fetch_article("en", "ThisPageDoesNotExist_12345_XYZ").await;
        assert!(fetched.is_err());
    }

    // A lone heading parses into at least one node.
    #[test]
    fn test_parse() {
        let parsed = parse("== Heading ==");
        assert!(!parsed.nodes.is_empty());
    }

    // Link targets are returned in document order; labels are ignored.
    #[test]
    fn test_get_links() {
        let parsed = parse("Here is a [[Link]] and another [[Another Link|with text]].");
        assert_eq!(get_links(&parsed.nodes), ["Link", "Another Link"]);
    }

    // Links inside list items are found as well.
    #[test]
    fn test_get_links_nested() {
        let parsed = parse("* [[List Link]]\n* Item with [[Nested Link]]");
        let found = get_links(&parsed.nodes);
        for expected in ["List Link", "Nested Link"] {
            assert!(found.iter().any(|link| link == expected));
        }
    }

    // Category targets keep their `Category:` prefix.
    #[test]
    fn test_get_categories() {
        let parsed = parse("Some text. [[Category:Programming languages]] [[Category:Rust]]");
        let found = get_categories(&parsed.nodes);
        for expected in ["Category:Programming languages", "Category:Rust"] {
            assert!(found.iter().any(|category| category == expected));
        }
    }

    // Each `<ref>` tag contributes its inner text.
    #[test]
    fn test_get_references() {
        let parsed =
            parse("Some statement.<ref>Source 1</ref> Another statement.<ref>Source 2</ref>");
        let found = get_references(&parsed.nodes);
        assert_eq!(found.len(), 2);
        for expected in ["Source 1", "Source 2"] {
            assert!(found.iter().any(|reference| reference == expected));
        }
    }

    // Template names are trimmed, with or without parameters.
    #[test]
    fn test_get_templates() {
        let parsed =
            parse("{{Infobox person\n| name = Example\n}}\nSome text. {{Another template}}");
        let found = get_templates(&parsed.nodes);
        assert_eq!(found.len(), 2);
        for expected in ["Infobox person", "Another template"] {
            assert!(found.iter().any(|template| template == expected));
        }
    }

    // Named parameters are returned with trimmed keys and values.
    #[test]
    fn test_get_template_parameters() {
        let parsed = parse("{{Infobox person\n| name = Example Name\n| age = 30\n}}");
        let found = get_template_parameters(&parsed.nodes, "Infobox person");
        assert_eq!(found.get("name").map(String::as_str), Some("Example Name"));
        assert_eq!(found.get("age").map(String::as_str), Some("30"));
    }
}

181
src/main.rs Normal file
View File

@@ -0,0 +1,181 @@
use Brig::{fetch_article, parse, get_links, get_categories, get_references, get_templates, get_template_parameters};
use parse_wiki_text::Node;
/// Demo entry point: fetches one Wikipedia article, parses it and prints a
/// summary of its links, categories, references, templates and the parameters
/// of one infobox template.
///
/// A fetch failure is reported on stderr and ends the program normally.
#[tokio::main]
async fn main() {
    // Example usage of the library
    let language = "en";
    let title = "Rust (programming language)";
    println!("Fetching article '{}' from {} Wikipedia...", title, language);

    let wikitext = match fetch_article(language, title).await {
        Ok(wikitext) => wikitext,
        Err(e) => {
            eprintln!("Error: {}", e);
            return;
        }
    };
    println!("Successfully fetched article. Length: {}", wikitext.len());

    let output = parse(&wikitext);
    println!("Parsed article. Nodes: {}", output.nodes.len());

    let links = get_links(&output.nodes);
    println!("Found {} links:", links.len());
    print_numbered(&links, 10);

    // Categories are always listed in full.
    let categories = get_categories(&output.nodes);
    println!("Found {} categories:", categories.len());
    print_numbered(&categories, usize::MAX);

    let references = get_references(&output.nodes);
    println!("Found {} references:", references.len());
    print_numbered(&references, 5);

    let templates = get_templates(&output.nodes);
    println!("Found {} templates:", templates.len());
    print_numbered(&templates, 10);

    // One constant drives both the truncation and the "more" count, so the
    // two can no longer disagree.
    const PARAMETER_LIMIT: usize = 50;
    let template_name_to_find = "Infobox programming language";
    let template_params = get_template_parameters(&output.nodes, template_name_to_find);
    println!("Found {} parameters for template '{}':", template_params.len(), template_name_to_find);
    for (key, value) in template_params.iter().take(PARAMETER_LIMIT) {
        println!(" - {}: {}", key, value);
    }
    if template_params.len() > PARAMETER_LIMIT {
        println!(" ... and {} more", template_params.len() - PARAMETER_LIMIT);
    }

    // Visit and print nodes (demonstration)
    // visit(&output.nodes, 0);
}

/// Prints at most `limit` items as a 1-based numbered list, followed by a
/// "... and N more" line when items were left out.
fn print_numbered(items: &[String], limit: usize) {
    for (i, item) in items.iter().take(limit).enumerate() {
        println!(" {}. {}", i + 1, item);
    }
    if items.len() > limit {
        println!(" ... and {} more", items.len() - limit);
    }
}
/// Prints an outline of the node tree to stdout, one line per node.
///
/// Each line is prefixed with `depth * 2` repetitions of the indent unit;
/// children are printed with a larger `depth`. Intended as a demonstration
/// and debugging aid.
fn visit(nodes: &[Node], depth: usize) {
    // The prefix depends only on `depth`, so it is built once per call.
    let pad = " ".repeat(depth * 2);
    for node in nodes {
        match node {
            Node::Heading { level, nodes: children, .. } => {
                println!("{}Heading (level {})", pad, level);
                visit(children, depth + 1);
            }
            Node::Text { value, .. } => {
                println!("{}Text: {:?}", pad, value);
            }
            Node::Link { target, text: label, .. } => {
                println!("{}Link: {}", pad, target);
                visit(label, depth + 1);
            }
            Node::ExternalLink { nodes: children, .. } => {
                println!("{}ExternalLink", pad);
                visit(children, depth + 1);
            }
            Node::UnorderedList { items, .. } => {
                println!("{}UnorderedList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }
            Node::OrderedList { items, .. } => {
                println!("{}OrderedList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }
            Node::DefinitionList { items, .. } => {
                println!("{}DefinitionList", pad);
                for entry in items {
                    visit(&entry.nodes, depth + 1);
                }
            }
            Node::Image { target, text: caption, .. } => {
                println!("{}Image: {}", pad, target);
                visit(caption, depth + 1);
            }
            Node::Template { name, parameters, .. } => {
                println!("{}Template", pad);
                println!("{} Name:", pad);
                visit(name, depth + 2);
                for param in parameters {
                    println!("{} Param", pad);
                    if let Some(key) = &param.name {
                        println!("{} Key:", pad);
                        visit(key, depth + 3);
                    }
                    println!("{} Value:", pad);
                    visit(&param.value, depth + 3);
                }
            }
            Node::Parameter { name, default, .. } => {
                println!("{}Parameter Usage", pad);
                println!("{} Name:", pad);
                visit(name, depth + 2);
                if let Some(fallback) = default {
                    println!("{} Default:", pad);
                    visit(fallback, depth + 2);
                }
            }
            Node::Tag { name, nodes: children, .. } => {
                println!("{}Tag: <{}>", pad, name);
                visit(children, depth + 1);
            }
            Node::StartTag { name, .. } => {
                println!("{}StartTag: <{}>", pad, name);
            }
            Node::EndTag { name, .. } => {
                println!("{}EndTag: </{}>", pad, name);
            }
            Node::Preformatted { nodes: children, .. } => {
                println!("{}Preformatted", pad);
                visit(children, depth + 1);
            }
            Node::Table { rows, captions, .. } => {
                println!("{}Table", pad);
                for caption in captions {
                    println!("{} Caption", pad);
                    visit(&caption.content, depth + 2);
                }
                for row in rows {
                    println!("{} Row", pad);
                    for cell in &row.cells {
                        println!("{} Cell", pad);
                        visit(&cell.content, depth + 3);
                    }
                }
            }
            // Leaf nodes: only the node kind is printed.
            Node::Bold { .. } => println!("{}Bold", pad),
            Node::Italic { .. } => println!("{}Italic", pad),
            Node::BoldItalic { .. } => println!("{}BoldItalic", pad),
            Node::ParagraphBreak { .. } => println!("{}ParagraphBreak", pad),
            Node::HorizontalDivider { .. } => println!("{}HorizontalDivider", pad),
            Node::Category { target, ordinal, .. } => {
                println!("{}Category: {}", pad, target);
                visit(ordinal, depth + 1);
            }
            Node::Redirect { target, .. } => {
                println!("{}Redirect: {}", pad, target);
            }
            Node::Comment { .. } => println!("{}Comment", pad),
            Node::CharacterEntity { character, .. } => {
                println!("{}CharacterEntity: {}", pad, character)
            }
            Node::MagicWord { .. } => println!("{}MagicWord", pad),
            // _ => println!("{}Unknown Node: {:?}", indent, node),
        }
    }
}

0
src/parser/mod.rs Normal file
View File

99
tests/tests.rs Normal file
View File

@@ -0,0 +1,99 @@
use Brig::{fetch_article, parse, get_links, get_categories, get_references, get_templates, get_template_parameters};
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetches the English "Rust (programming language)" article and checks
    /// that the request succeeded with non-empty content. Shared by the
    /// integration tests below; performs a real request.
    async fn get_rust_article_content() -> String {
        let fetched = fetch_article("en", "Rust (programming language)").await;
        assert!(fetched.is_ok(), "Should successfully fetch a valid article");
        let wikitext = fetched.unwrap();
        assert!(!wikitext.is_empty(), "Content should not be empty");
        wikitext
    }

    #[tokio::test]
    async fn test_fetch_article_valid() {
        // Scenario A: Valid access
        let wikitext = get_rust_article_content().await;
        assert!(!wikitext.is_empty());
    }

    #[tokio::test]
    async fn test_fetch_article_invalid() {
        // Scenario Z: Incorrect article
        let fetched = fetch_article("en", "ThisPageDoesNotExist_12345_XYZ").await;
        assert!(fetched.is_err(), "Should fail to fetch a non-existent article");
    }

    #[tokio::test]
    async fn test_parse_article() {
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        assert!(!output.nodes.is_empty(), "Should parse fetched content");
    }

    #[tokio::test]
    async fn test_get_links_integration() {
        // Scenario C: Valid access -> Status 200 + links of the article.
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        assert!(
            !get_links(&output.nodes).is_empty(),
            "Should find links in the Rust article"
        );
    }

    #[tokio::test]
    async fn test_get_categories_integration() {
        // Scenario B: Valid access -> Status 200 + categories of the article.
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        assert!(
            !get_categories(&output.nodes).is_empty(),
            "Should find categories in the Rust article"
        );
    }

    #[tokio::test]
    async fn test_get_references_integration() {
        // Scenario D: Valid access -> Status 200 + references of the article.
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        assert!(
            !get_references(&output.nodes).is_empty(),
            "Should find references in the Rust article"
        );
    }

    #[tokio::test]
    async fn test_get_templates_integration() {
        // Scenario E: Valid access -> Status 200 + templates of the article.
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        assert!(
            !get_templates(&output.nodes).is_empty(),
            "Should find templates in the Rust article"
        );
    }

    #[tokio::test]
    async fn test_get_template_parameters_integration() {
        // Scenario F: Valid access -> Status 200 + parameters of a template.
        let wikitext = get_rust_article_content().await;
        let output = parse(&wikitext);
        let template_names = get_templates(&output.nodes);

        // "Infobox programming language" is likely present on the Rust page.
        let wanted = "Infobox programming language";
        if template_names.iter().any(|name| name == wanted) {
            let params = get_template_parameters(&output.nodes, wanted);
            assert!(!params.is_empty(), "Should find parameters for Infobox programming language");
        } else {
            // Fallback: accept any template that has parameters, so the test
            // survives content changes on Wikipedia.
            let any_with_params = template_names
                .iter()
                .any(|name| !get_template_parameters(&output.nodes, name).is_empty());
            assert!(any_with_params, "Should find parameters for at least one template");
        }
    }
}