Rust Tutorial

Installation

Add a dependency to graphANNIS in you Cargo.toml file:

graphannis = "0.30.0"

API documentation

The API documentation is available at https://docs.rs/graphannis/.

Corpus data directory

Data is organized in corpora, where each corpus has a name and annotations can only refer to other annotations in the same corpus. A CorpusStorage is used to access a collection corpora by their name.

use graphannis::CorpusStorage;
use std::path::PathBuf;

fn main() {
    let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("data"), true).unwrap();
    let corpora = cs.list().unwrap();
    let corpus_names: Vec<String> = corpora
        .into_iter()
        .map(|corpus_info| corpus_info.name)
        .collect();
    println!("{:?}", corpus_names);
}

This will print an empty list, because no corpora have been created yet. In this example, the CorpusStorage uses the sub-directory data of the current working directory to store the corpora. You can also use an absolute path as argument:

let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("/tmp/graphannis-data"), true)?;

Only one process can access a graphANNIS data directory, other processes will fail to open it if there is another process holding a lock. The CorpusStorage is thread-safe, thus multiple threads of the same process can call all functions in parallel.

Adding corpus data

Linguistic annotations as represented in graphANNIS as directed graphs (see the data model section for more information). You can add nodes and edges via the apply_update(...) function. It takes the corpus name and a list of graph updates as argument. These graph update lists are represented by the class GraphUpdate. E.g the following code creates a graph update for the tokenized sentence “That is a Category 3 storm.”. Normally, you would not add all events manually in the source code, which gets a bit verbose, but have input data that you map to update events. The resulting GraphUpdate object can then be used with the apply_update(...) function to insert the changes into the corpus.

use graphannis::update::{GraphUpdate, UpdateEvent};
use graphannis::CorpusStorage;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("data"), true).unwrap();

    let mut g = GraphUpdate::new();

    // First add the node (with the default type "node"),
    // then all node labels for the node.
    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t1".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t1".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "That".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t2".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t2".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "is".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t3".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t3".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "a".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t4".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t4".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "Category".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t5".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t5".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "3".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t6".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t6".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: "storm".to_string(),
    })?;

    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1#t7".to_string(),
        node_type: "node".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNodeLabel {
        node_name: "tutorial/doc1#t7".to_string(),
        anno_ns: "annis".to_string(),
        anno_name: "tok".to_string(),
        anno_value: ".".to_string(),
    })?;

    // Add the ordering edges to specify token order.
    // The names of the source and target nodes are given as in the enum as fields,
    // followed by the component layer, type and name.
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t1".to_string(),
        target_node: "tutorial/doc1#t2".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t2".to_string(),
        target_node: "tutorial/doc1#t3".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t3".to_string(),
        target_node: "tutorial/doc1#t4".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t4".to_string(),
        target_node: "tutorial/doc1#t5".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t5".to_string(),
        target_node: "tutorial/doc1#t6".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t6".to_string(),
        target_node: "tutorial/doc1#t7".to_string(),
        layer: "annis".to_string(),
        component_type: "Ordering".to_string(),
        component_name: "".to_string(),
    })?;

    // Insert the changes in the corpus with the name "tutorial"
    cs.apply_update("tutorial", &mut g).unwrap();

    // List newly created corpus
    let corpora = cs.list().unwrap();
    let corpus_names: Vec<String> = corpora
        .into_iter()
        .map(|corpus_info| corpus_info.name)
        .collect();
    println!("{:?}", corpus_names);

    Ok(())
}

You could add additional annotations like part of speech as labels on nodes. For labels on edges, you can use the UpdateEvent::AddEdgeLabel enumeration variant.

Querying

There are two functions to query a corpus with AQL:

  • count(...) returns the number of matches, and
  • find(...) returns a paginated list of matched node IDs.

You have to give the list of corpora and the query as arguments to both functions. The following example searches for all tokens that contain a s character.1

use graphannis::corpusstorage::{QueryLanguage, ResultOrder};
use graphannis::CorpusStorage;
use std::path::PathBuf;

fn main() {
    let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("data"), true).unwrap();
    let number_of_matches = cs
        .count(&["tutorial"], "tok=/.*s.*/", QueryLanguage::AQL)
        .unwrap();
    println!("Number of matches: {}", number_of_matches);

    let matches = cs
        .find(
            &["tutorial"],
            "tok=/.*s.*/",
            QueryLanguage::AQL,
            0,
            Some(100),
            ResultOrder::Normal,
        )
        .unwrap();
    for (i, m) in matches.iter().enumerate() {
        println!("Match {}: {}", i, m);
    }
}

Output:

Number of matches: 2
Match 0: tutorial/doc1#t2
Match 1: tutorial/doc1#t6

Getting subgraphs

The result from the find(...) function can be used to generate a subgraph for the matches. It will contain all covered nodes of the matches and additionally a given context (defined in tokens).

use graphannis::corpusstorage::{QueryLanguage, ResultOrder};
use graphannis::util;
use graphannis::CorpusStorage;
use std::path::PathBuf;

fn main() {
    let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("data"), true).unwrap();
    let matches = cs
        .find(
            &["tutorial"],
            "tok . tok",
            QueryLanguage::AQL,
            0,
            Some(100),
            ResultOrder::Normal,
        )
        .unwrap();
    for m in matches {
        println!("{}", m);
        // convert the match string to a list of node IDs
        let node_names = util::node_names_from_match(&m);
        let g = cs.subgraph("tutorial", node_names, 2, 2, None).unwrap();
        // find all nodes of type "node" (regular annotation nodes)
        let node_search =
            g.get_node_annos()
                .exact_anno_search(Some("annis"), "node_type", Some("node").into());
        println!("Number of nodes in subgraph: {}", node_search.count());
    }
}

Output:

tutorial/doc1#t1 tutorial/doc1#t2
Number of nodes in subgraph: 4
tutorial/doc1#t2 tutorial/doc1#t3
Number of nodes in subgraph: 5
tutorial/doc1#t3 tutorial/doc1#t4
Number of nodes in subgraph: 6
tutorial/doc1#t4 tutorial/doc1#t5
Number of nodes in subgraph: 6
tutorial/doc1#t5 tutorial/doc1#t6
Number of nodes in subgraph: 5
tutorial/doc1#t6 tutorial/doc1#t7
Number of nodes in subgraph: 4

The result object of the subgraph(...) function is the type Graph, which provides basic graph access functions (see the API documentation for details).

Note: The subgraph(...) function takes a single corpus name as argument instead of a list, so you need to know to which corpus a matched node belongs to.

Normally a corpus is structured into subcorpora and documents. GraphANNIS uses node types and relations of type PartOf to model the corpus structure. If you have document nodes and the PartOf relation between the annotation nodes and its document, you can use the subcorpus_graph(...) function to get all annotation nodes for a given list of document names.

use graphannis::update::{GraphUpdate, UpdateEvent};
use graphannis::CorpusStorage;
use std::path::PathBuf;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let cs = CorpusStorage::with_auto_cache_size(&PathBuf::from("data"), true).unwrap();
    let mut g = GraphUpdate::new();
    // create the corpus and document node
    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial".to_string(),
        node_type: "corpus".to_string(),
    })?;
    g.add_event(UpdateEvent::AddNode {
        node_name: "tutorial/doc1".to_string(),
        node_type: "corpus".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1".to_string(),
        target_node: "tutorial".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    // add the corpus structure to the existing nodes
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t1".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t2".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t3".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t4".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t5".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t6".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    g.add_event(UpdateEvent::AddEdge {
        source_node: "tutorial/doc1#t7".to_string(),
        target_node: "tutorial/doc1".to_string(),
        layer: "annis".to_string(),
        component_type: "PartOf".to_string(),
        component_name: "".to_string(),
    })?;
    // apply the changes
    cs.apply_update("tutorial", &mut g).unwrap();

    // get the whole document as graph
    let subgraph = cs
        .subcorpus_graph("tutorial", vec!["tutorial/doc1".to_string()])
        .unwrap();
    let node_search = subgraph.get_node_annos().exact_anno_search(
        Some("annis"),
        "node_type",
        Some("node").into(),
    );
    for m in node_search {
        // get the numeric node ID from the match
        let id = m.node;
        // get the node name from the ID by searching for the label with the name "annis::node_name"
        let matched_node_name = subgraph
            .get_node_annos()
            .get_annotations_for_item(&id)
            .into_iter()
            .filter(|anno| anno.key.ns == "annis" && anno.key.name == "node_name")
            .map(|anno| anno.val)
            .next()
            .unwrap();
        println!("{}", matched_node_name);
    }

    Ok(())
}

Output:

tutorial/doc1#t2
tutorial/doc1#t4
tutorial/doc1#t5
tutorial/doc1#t6
tutorial/doc1#t7
tutorial/doc1#t1
tutorial/doc1#t3
1

You can get an overview of AQL here or detailed information in the User Guide.