<?xml version="1.0" encoding="UTF-8"?>
<!--
################################################################################
File:   gamexml.dtd
Author: Colin Wiel
Date:   07/01/2002

Note: the text declaration above must stay on the very first line of this
file.  XML does not allow anything, not even a comment, to precede it.

Overview:

<game>
   The root element, represents the curation of one or more sequences of
   DNA, RNA, or AA.  Most commonly, the <game> element represents the
   curation of a single sequence.

<seq>
   Represents a sequence of DNA, RNA, or AA.  There is generally one
   <seq> in the document representing the primary sequence being curated, and
   other <seq>'s that support the curation of the primary sequence.  The
   primary <seq> is directly under the <game> element, and is identified by
   having its "focus" attribute set to true.  Each <seq> has one or more
   <dbxref>'s to indicate where the seq can be found using a particular unique
   identifier.

   For Drosophila curation, BDGP uses the primary <seq> to represent an
   accession, and other <seq>'s to represent cDNA's, protein coding sequences,
   and homologous sequences that are referenced by computational analyses such
   as tblastx.

<annotation>
   Represents a set of related sequence features and a collection of genetic
   information describing them.  The term "sequence feature" means a segment
   of DNA.  An annotation will generally contain a number of <feature_set>'s,
   each of which represents a set of related sequence features that have a
   specific location.  A <feature_set> can contain nested <feature_set>'s
   (although in practice this has not yet occurred), as well as one or more
   <feature_span>'s, each of which represents an individual sequence feature.
   A <feature_span> can contain <evidence> which specifies a result id and
   result type.  An <annotation> can have one or more <dbxref>'s.

   For Drosophila curation, the types of annotations are: gene, pseudogene,
   transposon, tRNA, rRNA, snRNA, snoRNA, "misc. non-coding RNA", and
   "miscellaneous curator's observation".  For an <annotation> of type "gene",
   one <feature_set> element represents each transcript, and for each
   transcript, one <feature_span> element represents each exon.

<computational_analysis>
   Contains evidence from computational analysis programs such as sim4 and
   blastx.  <result_set>'s and <result_span>'s represent a tree structure of
   results, with <result_set>'s representing branch nodes (e.g. gene matches),
   and <result_span>'s representing leaf nodes (e.g. exon matches).  The
   elements <feature_set>, <feature_span>, <result_set>, <result_span> run
   parallel to one another. Both allow multiple levels of nesting, both have
   physical location(s) on sequences. The key differences are that 'features'
   have results as evidence and 'results' have some form of an associated
   score for the assay.  <seq_relationship>'s provide the locations on the
   underlying <seq>'s.
################################################################################
-->

<!--
################################################################################
<game>
   The root element, represents the curation of one or more sequences of
   DNA, RNA, or AA.  Most commonly, the <game> element represents the
   curation of a single sequence.
################################################################################
-->
<!-- Content model, in this fixed order: one or more <seq>'s, exactly one
     <map_position>, then any number of <annotation>'s followed by any number
     of <computational_analysis> elements. -->
<!ELEMENT game (seq+,
                map_position,
                annotation*,
                computational_analysis*)>
<!-- version: required name token. -->
<!ATTLIST game
          version NMTOKEN #REQUIRED>

<!--
################################################################################
<seq>
   Represents a sequence of DNA, RNA, or AA.  There is generally one
   <seq> in the document representing the primary sequence being curated, and
   other <seq>'s that support the curation of the primary sequence.  The
   primary <seq> is directly under the <game> element, and is identified by
   having its "focus" attribute set to true.  Each <seq> has one or more
   <dbxref>'s to indicate where the seq can be found using a particular unique
   identifier.

   For Drosophila curation, BDGP uses the primary <seq> to represent an
   accession, and other <seq>'s to represent cDNA's, protein coding sequences,
   and homologous sequences that are referenced by computational analyses such
   as tblastx.
################################################################################
-->
<!--  A <seq> holds, in order: a name, an optional description, optional
      residues, any number of <dbxref>'s, and an optional organism.  -->
<!ELEMENT seq ( name, description?, residues?, dbxref*, organism? ) >
<!ATTLIST seq version CDATA #REQUIRED >
<!--  ** Flag for discussion - type: molecule / alphabet  -->
<!ATTLIST seq type (AA | DNA | RNA) #IMPLIED >
<!ATTLIST seq length NMTOKEN #REQUIRED >
<!ATTLIST seq md5checksum CDATA #IMPLIED >
<!--  ** Flag for discussion - unique id?  Same as name?  -->
<!--  The id is this sequence's own identifier, not a reference to another
      element, so it must not be typed IDREF: an IDREF value is only valid
      when it matches an ID attribute somewhere else in the document.
      NMTOKEN matches the id attributes of <annotation>, <feature_set> and
      <feature_span>.  It does not enforce uniqueness.  -->
<!ATTLIST seq id NMTOKEN #REQUIRED >
<!--  The primary sequence being curated is identified by focus="true".
      Optional, so sequences that omit it stay valid.  -->
<!ATTLIST seq focus ( true | false ) #IMPLIED >

<!-- Leaf elements of <seq> (name is also used by other elements); each holds
     plain character data only. -->
<!ELEMENT name        (#PCDATA)>
<!ELEMENT description (#PCDATA)>
<!ELEMENT residues    (#PCDATA)>

<!--
################################################################################
<dbxref>
   Represents a database cross-reference.  The <xref_db> is the name of the
   database (e.g. EMBL), and the <db_xref_id> is the unique id within that
   database.
################################################################################
-->
<!-- A cross-reference is exactly one <xref_db> followed by exactly one
     <db_xref_id>; both are plain character data. -->
<!ELEMENT dbxref     (xref_db, db_xref_id)>
<!ELEMENT xref_db    (#PCDATA)>
<!ELEMENT db_xref_id (#PCDATA)>

<!--  ** Flag for discussion - organism: IDREF?  (species,genus)? -->
<!ELEMENT organism (#PCDATA)>

<!--
################################################################################
<map_position>  ** Flag for discussion - deprecate?
   Maps a sequence onto a chromosome.

   For Drosophila curation, only one <map_position> is used per file, to
   specify the location of the primary sequence on a chromosome.
################################################################################
-->
<!-- A map position is one <arm> followed by one <span>.  Both attributes
     (seq and type) are required name tokens. -->
<!ELEMENT map_position (arm, span)>
<!ATTLIST map_position
          seq  NMTOKEN #REQUIRED
          type NMTOKEN #REQUIRED>

<!ELEMENT arm (#PCDATA)>

<!--  ** Flag for discussion - (start, end, strand)? -->
<!-- A span is a <start> followed by an <end>.  It is also the first child of
     <seq_relationship>. -->
<!ELEMENT span (start, end)>

<!--  ** Flag for discussion - space based coordinates? -->
<!ELEMENT start (#PCDATA)>
<!ELEMENT end   (#PCDATA)>

<!--
################################################################################
<annotation>
   Represents a set of related sequence features and a collection of genetic
   information describing them.  The term "sequence feature" means a segment
   of DNA.  An annotation will generally contain a number of <feature_set>'s,
   each of which represents a set of related sequence features that have a
   specific location.  A <feature_set> can contain nested <feature_set>'s
   (although in practice this has not yet occurred), as well as one or more
   <feature_span>'s, each of which represents an individual sequence feature.
   A <feature_span> can contain <evidence> which specifies a result id and
   result type.  An <annotation> can have one or more <dbxref>'s.

   For Drosophila curation, the types of annotations are: gene, pseudogene,
   transposon, tRNA, rRNA, snRNA, snoRNA, "misc. non-coding RNA", and
   "miscellaneous curator's observation".  For an <annotation> of type "gene",
   one <feature_set> element represents each transcript, and for each
   transcript, one <feature_span> element represents each exon.
################################################################################
-->
<!-- Children must appear in this fixed order.  At least one <aspect> and at
     least one <feature_set> are required; <gene> is optional; <property> and
     <dbxref> may repeat or be absent. -->
<!ELEMENT annotation (name,
                      type,
                      property*,
                      gene?,
                      dbxref*,
                      aspect+,
                      feature_set+)>
<!-- id: required name token. -->
<!ATTLIST annotation
          id NMTOKEN #REQUIRED>

<!ELEMENT synonym ( #PCDATA ) >

<!--  A comment is a <text>, optionally followed by a <person> and then a
      <date>.  The "internal" attribute is an optional name token.  -->
<!ELEMENT comment ( text, person?, date? ) >
<!ATTLIST comment internal NMTOKEN #IMPLIED >

<!--  <text> and <person> are required by the <comment> content model, so they
      need declarations of their own; without them no document containing a
      <comment> can be valid.  They are declared as plain character data,
      like the other leaf elements in this DTD.
      NOTE(review): assumes neither element is declared elsewhere; confirm the
      intended content.  -->
<!ELEMENT text ( #PCDATA ) >

<!ELEMENT person ( #PCDATA ) >

<!--  ** Flag for discussion - type used in many ways -->
<!--  **    - add attribute such as "termid=SO:001"? -->
<!ELEMENT type (#PCDATA)>

<!-- A property is a <type> followed by a <value>; both are plain character
     data. -->
<!ELEMENT property (type, value)>
<!ELEMENT value    (#PCDATA)>

<!--
################################################################################
<gene>  ** Flag for discussion - deprecate.
   Not really necessary, since the <type> of the <annotation> indicates if the
   annotation is a gene, the <dbxref> of the annotation can contain the same
   information that the <dbxref> of the <gene> contains.

   A <gene> is a <name> followed by exactly one <dbxref>; its "association"
   attribute is a required name token.
################################################################################
-->
<!ELEMENT gene ( name, dbxref ) >
<!ATTLIST gene association NMTOKEN #REQUIRED >

<!--
################################################################################
<aspect>  ** Flag for discussion - deprecate?
   Used for GO terms in an annotation.  Each <aspect> will have a <dbxref> with
   <xref_db> of "GO" and <db_xref_id> of the GO id, such as "GO:0005576".
################################################################################
-->
<!-- An aspect pairs exactly one <dbxref> with exactly one <property>, in that
     order. -->
<!ELEMENT aspect (dbxref, property)>

<!--
################################################################################
<feature_set>
   Represents a set of sequence features (segments of DNA).

   For Drosophila curation, a <feature_set> represents a transcript, and
   contains a number of <feature_span>'s which represent the start codon and
   the exons.  Each <feature_set> typically contains two <seq>'s, one for the
   cDNA sequence and one for the protein coding sequence of the transcript.
################################################################################
-->
<!--  Children must appear in exactly this order.  Every child particle in the
      sequence needs a comma separator, including the one between author and
      date.  The "problem" and "id" attributes are required name tokens.
      NOTE(review): the description says a <feature_set> can contain nested
      <feature_set>'s, but this content model does not allow it; confirm
      which is intended.  -->
<!ELEMENT feature_set ( name, type, feature_span+, seq+, description?, author,
                        date, synonym+, comment+, property+, evidence+,
                        seq_relationship+ ) >
<!ATTLIST feature_set problem NMTOKEN #REQUIRED >
<!ATTLIST feature_set id NMTOKEN #REQUIRED >

<!ELEMENT author ( #PCDATA ) >

<!--  <evidence> carries its data in its two attributes (result and type, both
      required name tokens) and has no content, so it is declared EMPTY.  An
      element declaration must always state a content specification.  -->
<!ELEMENT evidence EMPTY >
<!ATTLIST evidence result NMTOKEN #REQUIRED >
<!ATTLIST evidence type NMTOKEN #REQUIRED >

<!--
################################################################################
<feature_span>
   Represents a sequence feature (segment of DNA), including its location,
   which is specified in a <seq_relationship>.  Can contain <evidence> which
   specifies a result id and result type.

   For Drosophila curation, a <feature_span> represents an exon or a start
   codon.  Each <feature_span> contains a <seq_relationship> of type "query"
   specifying the location on the primary sequence of the document, which is
   typically an accession.
################################################################################
-->
<!-- Children in order: a <type>, one or more <seq_relationship>'s, and an
     optional <name>.
     produces_seq: optional name token.
     id:           required name token.
     NOTE(review): the description says a <feature_span> can contain
     <evidence>, but this content model has no <evidence> child; confirm. -->
<!ELEMENT feature_span (type, seq_relationship+, name?)>
<!ATTLIST feature_span
          produces_seq NMTOKEN #IMPLIED
          id           NMTOKEN #REQUIRED>

<!--
################################################################################
<seq_relationship>
   Provides the locations on the underlying <seq>'s.  Every <seq_relationship>
   absolutely requires a seq_id reference.  There are no hidden assumptions
   regarding which sequence the positions refer to - it is firmly explicit.
   This makes it possible, among other things, to have the same feature appear
   on multiple sequences.
################################################################################
-->
<!-- A <span> optionally followed by an <alignment>.
     seq:  required, free-form character data.
     type: optional; when present it must be "subject" or "query". -->
<!ELEMENT seq_relationship (span, alignment?)>
<!ATTLIST seq_relationship
          seq  CDATA               #REQUIRED
          type (subject | query)   #IMPLIED>

<!--  ** Flag for discussion - free text?  What are legal strings? -->
<!ELEMENT alignment (#PCDATA)>

<!--
################################################################################
<computational_analysis>
   Contains evidence from computational analysis programs such as sim4 and
   blastx.  <result_set>'s and <result_span>'s represent a tree structure of
   results, with <result_set>'s representing branch nodes (e.g. gene matches),
   and <result_span>'s representing leaf nodes (e.g. exon matches).  The
   elements <feature_set>, <feature_span>, <result_set>, <result_span> run
   parallel to one another. Both allow multiple levels of nesting, both have
   physical location(s) on sequences. The key differences are that 'features'
   have results as evidence and 'results' have some form of an associated
   score for the assay.  <seq_relationship>'s provide the locations on the
   underlying <seq>'s.
################################################################################
-->
<!-- Children in order: a <program>, a <database>, an optional <date>, then
     any number of <result_set>'s followed by any number of <property>'s. -->
<!ELEMENT computational_analysis (program,
                                  database,
                                  date?,
                                  result_set*,
                                  property*)>

<!--  ** Flag for discussion - add version to program? -->
<!ELEMENT program  (#PCDATA)>
<!ELEMENT database (#PCDATA)>

<!-- A date is plain character data with an optional "timestamp" name-token
     attribute. -->
<!ELEMENT date (#PCDATA)>
<!ATTLIST date
          timestamp NMTOKEN #IMPLIED>

<!-- NOTE(review): no content model visible in this file references
     <version>; confirm whether it is still needed. -->
<!ELEMENT version (#PCDATA)>

<!-- A result set is a <name> followed by one or more <result_span>'s.  Its id
     is optional and, being of type ID, must be unique in the document. -->
<!ELEMENT result_set (name, result_span+)>
<!ATTLIST result_set
          id ID #IMPLIED>

<!--  ** Flag for discussion - add input as well? -->
<!-- Children in order: one or more <output>'s, one or more
     <seq_relationship>'s, an optional <type>, and an optional <score>.  The
     id is optional and of type ID. -->
<!ELEMENT result_span (output+, seq_relationship+, type?, score?)>
<!ATTLIST result_span
          id ID #IMPLIED>

<!--  ** Flag for discussion - make output a property? -->
<!-- An output is a <type> followed by a <value>, the same shape as
     <property>. -->
<!ELEMENT output (type, value)>

<!--  ** Flag for discussion - make score output type? -->
<!ELEMENT score (#PCDATA)>
