# RELAX NG Compact Syntax schema for the PoliticalMashup newspaper xml-files.
#
# NOTE: '#' comments run to the end of the physical line, so every definition
# and every comment must stay on its own line for this grammar to parse.

default namespace = ""
namespace dc = "http://purl.org/dc/elements/1.1/"
namespace pm = "http://www.politicalmashup.nl"

# Newspaper xml-files are normally contained within a pm:KBroot element.
# A single pm:root (one article) is also accepted as the document root.
start = KBRoot | PmRoot

# <pm:KBroot> is the container (xml-file-root) for a set of articles.
KBRoot = element pm:KBroot {
  RecordFile,
  # '*' (zero or more) allows files created with criteria that did not match any article.
  PmRoot*
}

# @recordfile contains the absolute path of the file with article meta-data during processing.
# It is present in aggregate xml documents that are the direct result of initial processing.
# For derived xml files, it can/should be used as file identifier
# (e.g. "1990" for an xml with all articles from 1990).
RecordFile = attribute recordfile { text }

# <pm:root> contains all information of exactly one article.
# For many retrieval purposes, one such pm:root counts as "one document".
PmRoot = element pm:root {
  PmDocinfo,
  PmMeta,
  PmContent
}

# <pm:docinfo> is used in other PoliticalMashup data, but is empty for the newspapers.
PmDocinfo = element pm:docinfo { empty }

# <pm:meta> contains meta-data about the article, namely publication date,
# type of article, unique identifier and source information.
PmMeta = element pm:meta {
  DcDate,
  DcSubject,
  DcIdentifier,
  DcSource
}

# <dc:date> contains the original date of publication of the article,
# i.e. the publication date of the specific paper it appeared in.
DcDate = element dc:date { xsd:date }

# <dc:subject> contains the type of the article; currently known options:
# advertentie, artikel, familiebericht, illustratie met onderschrift
DcSubject = element dc:subject {
  "advertentie"
  | "artikel"
  | "familiebericht"
  | "illustratie met onderschrift"
}

# <dc:identifier> contains the unique identifier for this article.
DcIdentifier = element dc:identifier { IdentifierToken }

# String that represents a unique identifier for a newspaper article, and is resolvable at the KB.
# e.g. ddd:010567623:mpeg21:p001:a0001 @ http://kranten.kb.nl/view/article/id/ddd:010567623:mpeg21:p001:a0001
# The structure of the identifiers is probably:
#   ddd:<unique-id-for-single-newspaper>:mpeg21:p<unique-page-id-in-paper>:a<unique-article-id-in-paper>
IdentifierToken = xsd:token { pattern = "ddd:[0-9]+:mpeg21:p[0-9]+:a[0-9]+" }

# <dc:source> contains the newspaper source in which the article appeared.
# It is described as a one-time-recursive element (a dc:source wrapping a dc:source
# that holds the pm:link).
# NOTE(review): the pattern below does not enforce that depth; it accepts any number of
# nested dc:source elements around the final pm:link. Confirm against the data before tightening.
DcSource = element dc:source { ( DcSource | PmLink ) }

# <pm:link> contains the textual name and id of the newspaper in which the article appeared.
# @pm:description contains the textual name of the newspaper
#   (e.g. "Leeuwarder courant : hoofdblad van Friesland").
# @pm:source contains the unique id for the newspaper in the KB catalogue (e.g. "865061483").
# According to the documentation, the @pm:source is the PPN: the identification number of the
# newspaper in the general catalogue (Gemeenschappelijk Geautomatiseerd Catalogussysteem, GGC).
PmLink = element pm:link {
  attribute pm:description { text },
  # Previously declared as: attribute pm:source { xsd:integer }
  # Sometimes the pm:source ends with an X (and there may be other reasons why xsd:integer
  # is invalid), so digits with an optional trailing X are accepted instead.
  attribute pm:source { xsd:token { pattern = "[0-9]+X?" } }
}

# <pm:content> contains the actual content of the article (title, text and some attributes).
PmContent = element pm:content {
  ContentPmId,
  PmSource,
  Title,
  Text
}

# @pm:id of the PmContent element is the main identifier of the content, and equal to the
# DcIdentifier in the PmMeta of the PmRoot of the same article.
# (That equality is a convention; this grammar only checks the identifier format.)
ContentPmId = attribute pm:id { IdentifierToken }

# @pm:source contains the actual url of the article data (image + meta-data) at the source.
PmSource = attribute pm:source { xsd:anyURI }

# <title> contains the title of the article (either text, or also often empty).
Title = element title { TextualContent }

# Actual textual content, that can be searched or indexed, and is uniquely identified by an id.
TextualContent = ( SubContentPmId, text )

# @pm:id of Title and Paragraph elements is always the IdentifierToken plus some suffix.
# The suffix is a literal dot followed by one or more of 't', digits and dots.
# The dot is escaped: an unescaped '.' matches any character in an XSD regular expression,
# which would also accept malformed ids and plain article ids with extra digits.
# TODO: merge, at least conceptually, IdentifierToken, ContentPmId and SubContentPmId.
SubContentPmId = attribute pm:id {
  xsd:token { pattern = "ddd:[0-9]+:mpeg21:p[0-9]+:a[0-9]+\.[t0-9.]+" }
}

# <text> contains all (zero or more) paragraphs of actual textual content for the article.
# The only time the text contains no paragraphs is when there was some problem
# (source data either not available or broken).
Text = element text { Paragraph* }

# <p> contains one line/paragraph of actual textual content of an article.
Paragraph = element p { TextualContent }