RELAX NG (Compact) Schema for OTMI
From OpenTextMining
Return to OTMI Specification
This annex defines a RELAX NG (Compact) schema specification for OTMI.
# Bind the "otmi" prefix, used by every element name in this schema,
# to the OTMI namespace URI.
namespace otmi = "http://www.nature.com/schema/2006/03/otmi"
# Validation starts at the "data" pattern.
start = data
## Root pattern: the "otmi:data" element, which is added to an
## Atom Entry document. It carries a required "version" attribute
## followed, in this order, by an optional stoplist, the sections,
## optional floats and optional references.
data =
  element otmi:data {
    attribute version { xsd:NMTOKEN },
    stoplist?,
    sections,
    floats?,
    references?
  }
## Optional pointer to the stoplist document that was used. The
## stoplist is identified by the URI in the "href" attribute; the
## element has no content (an attribute-only content model permits
## no children or text).
stoplist =
  element otmi:stoplist {
    attribute href { xsd:anyURI }
  }
## Sections: one or more "section" elements
sections =
  section+
## A section carries a "name" attribute and holds either OTMI text
## or one or more nested sections. Intended usage: a front section
## ("abstract" | "standfirst") holds OTMI text, while a body section
## ("body") holds child sections ("firstpara" | "methods" |
## "conclusions" | "others") that in turn hold OTMI text. The
## pattern itself accepts any NCName as the section name.
section =
  element otmi:section {
    attribute name { xsd:NCName },
    (section+ | otmi-text)
  }
## Floats - for now just figures and tables are included
## (other floating objects could be added).
## Figure runs and table runs may both appear, in any order. A bare
## "figures | tables" choice would reject every document that has
## both figures and tables, so the choice is repeated instead.
floats =
  (figures | tables)+
## Figures: one or more "otmi:figure" elements, each holding the
## figure title followed by the figure caption
figures =
  element otmi:figure {
    title,
    caption
  }+
## Tables: one or more "otmi:table" elements, each holding only
## the table title
tables =
  element otmi:table {
    title
  }+
## Title of a figure or table, expressed as OTMI text
title =
  element otmi:title {
    otmi-text
  }
## Caption of a figure, expressed as OTMI text
caption =
  element otmi:caption {
    otmi-text
  }
## References: no reference text is carried, only URIs for the
## referenced documents plus a count of the references without one
references =
  element otmi:references {
    ## One or more "ref-id" elements, each referencing a document by URI
    element otmi:ref-id { xsd:anyURI }+,
    ## "refs-noid" gives the count of references
    ## that have no URI
    element otmi:refs-noid { xsd:integer }
  }
## OTMI text, the actual payload of an OTMI file: optional vectors,
## then optional snippets, then optional full text
otmi-text =
  vectors?,
  snippets?,
  full-text?
## Vectors: a table of word vectors. The element has a "number"
## attribute, the split-regex, and one or more "vector" entries,
## each with a "count" attribute and text content.
vectors =
  element otmi:vectors {
    attribute number { xsd:integer },
    split-regex,
    element otmi:vector {
      attribute count { xsd:integer },
      text
    }+
  }
## Snippets: a table of text snippets. The element has a "number"
## attribute, the split-regex, and one or more "snippet" entries
## with text content.
snippets =
  element otmi:snippets {
    attribute number { xsd:integer },
    split-regex,
    element otmi:snippet { text }+
  }
## "split-regex" holds, as text, the expression used to split text
split-regex =
  element otmi:split-regex {
    text
  }
## Full text takes exactly one of two forms: without stopwords
## removed ("raw-text") or with stopwords removed ("reduced-text")
full-text =
  raw-text | reduced-text
## "reduced-text" carries arbitrary text that has been cleaned of
## markup and has had its stopwords removed
reduced-text =
  element otmi:reduced-text {
    text
  }
## "raw-text" carries arbitrary text that has been cleaned of
## markup but still contains its stopwords
raw-text =
  element otmi:raw-text {
    text
  }
