Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision | ||
user:zeman:treebanks:ro [2012/01/12 17:11] zeman References, domain and size. |
user:zeman:treebanks:ro [2012/01/12 17:29] (current) zeman Inside and parsing. |
||
---|---|---|---|
Line 49: | Line 49: | ||
==== Inside ==== | ==== Inside ==== | ||
- | The corpus contains texts from Portugal | + | Sentences have been segmented into clauses |
- | Morphological annotation includes | + | There are part-of-speech tags but no lemmas |
- | + | ||
- | Multi-word expressions have been concatenated into one token, using underscore | + | |
==== Sample ==== | ==== Sample ==== | ||
- | The first two sentences | + | The first sentence |
- | | 1 | Um | um | art | art | <nowiki><arti>|M|S</nowiki> | 2 | <nowiki>>N</nowiki> | <nowiki>_</nowiki> | < | + | <code xml><?xml version=" |
- | | 2 | revivalismo | revivalismo | n | n | <nowiki>M|S</nowiki> | 0 | UTT | <nowiki>_</nowiki> | <nowiki>_</ | + | <!DOCTYPE DGAdoc SYSTEM " |
- | | 3 | refrescante | refrescante | adj | adj | <nowiki>M|S</nowiki> | 2 | <nowiki>N<</nowiki> | < | + | <DGAdoc> |
- | | |||||||||| | + | <s> |
- | | 1 | O | o | art | art | <nowiki><artd>|M|S</nowiki> | 2 | <nowiki>>N</nowiki> | <nowiki>_</nowiki> | <nowiki>_</ | + | <tok> |
- | | 2 | <nowiki>7_e_Meio</nowiki> | <nowiki>7_e_Meio</nowiki> | prop | prop | <nowiki>M|S</nowiki> | 3 | SUBJ | <nowiki>_</ | + | |
- | | 3 | é | ser | v | <nowiki>v-fin</nowiki> | <nowiki>PR|3S|IND</nowiki> | 0 | STA | <nowiki>_</nowiki> | < | + | <ordno>1</ordno> |
- | | 4 | um | um | art | art | <nowiki><arti>|M|S</nowiki> | 5 | < | + | |
- | | 5 | <nowiki>ex-libris</nowiki> | <nowiki>ex-libris</nowiki> | n | n | <nowiki>M|P</nowiki> | 3 | SC | < | + | |
- | | 6 | de | de | prp | prp | <nowiki>< | + | |
- | | 7 | a | o | art | art | <nowiki>< | + | <reltype>atribut adj.</reltype> |
- | | 8 | noite | noite | n | n | < | + | |
- | | 9 | algarvia | algarvio | adj | adj | <nowiki>F|S</nowiki> | 8 | <nowiki>N<</nowiki> | <nowiki>_</nowiki> | < | + | </tok> |
- | | 10 | <nowiki>.</nowiki> | < | + | |
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <reltype>subiect</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | <tok> | ||
+ | | ||
+ | <ordno>3</ordno> | ||
+ | | ||
+ | | ||
+ | <head>2</head> | ||
+ | | ||
+ | | ||
+ | </tok> | ||
+ | <tok> | ||
+ | | ||
+ | <ordno>4</ordno> | ||
+ | | ||
+ | | ||
+ | <head>3</head> | ||
+ | <reltype>rel. poses.</reltype> | ||
+ | </syn> | ||
+ | </tok> | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | <ctag>verb aux.</ctag> | ||
+ | | ||
+ | <head>6</head> | ||
+ | <reltype>rel. aux.</reltype> | ||
+ | </syn> | ||
+ | </tok> | ||
+ | <tok> | ||
+ | | ||
+ | <ordno>6</ordno> | ||
+ | | ||
+ | | ||
+ | <head>7</head> | ||
+ | | ||
+ | | ||
+ | </tok> | ||
+ | </s></code> | ||
- | The first two sentences | + | The first sentence |
- | | 1 | É | é | adv | adv | <nowiki><foc></nowiki> | 9 | FOC | <nowiki>_</nowiki> | <nowiki>_</ | + | <code xml><?xml version=" |
- | | 2 | por | por | prp | prp | <nowiki>_</nowiki> | 9 | ADVL | <nowiki>_</nowiki> | <nowiki>_</nowiki> | | + | <!DOCTYPE DGAdoc SYSTEM " |
- | | 3 | isso | isso | pron | <nowiki>pron-indp</nowiki> | < | + | <DGAdoc> |
- | | 4 | que | que | adv | adv | <nowiki><foc></nowiki> | 9 | FOC | <nowiki>_</nowiki> | <nowiki>_</nowiki> | | + | <s> |
- | | 5 | <nowiki>,</nowiki> | < | + | <tok> |
- | | 6 | explica | explicar | v | <nowiki>v-fin</nowiki> | <nowiki>PR|3S|IND</nowiki> | 0 | STA | <nowiki>_</nowiki> | <nowiki>_</ | + | <orth>Judecatorul</orth> |
- | | 7 | <nowiki>,</nowiki> | <nowiki>,</nowiki> | punc | punc | <nowiki>_</nowiki> | 6 | PUNC | <nowiki>_</nowiki> | < | + | |
- | | 8 | não | não | adv | adv | <nowiki>_</nowiki> | 9 | ADVL | <nowiki>_</nowiki> | <nowiki>_</ | + | |
- | | 9 | tem | ter | v | <nowiki>v-fin</nowiki> | <nowiki>PR|3S|IND</nowiki> | 6 | ACC | <nowiki>_</nowiki> | <nowiki>_</ | + | <syn> |
- | | 10 | pena | pena | n | n | <nowiki>F|S</nowiki> | 9 | ACC | <nowiki>_</nowiki> | <nowiki>_</nowiki> | | + | <head>9</head> |
- | | 11 | de | de | prp | prp | <nowiki>_</nowiki> | 10 | < | + | <reltype>subiect</reltype> |
- | | 12 | <nowiki>Hillary_Clinton</nowiki> | < | + | |
- | | 13 | <nowiki>.</nowiki> | <nowiki>.</nowiki> | punc | punc | <nowiki>_</ | + | </tok> |
- | | |||||||||| | + | |
- | | 1 | <nowiki>«</nowiki> | <nowiki>«</nowiki> | punc | punc | < | + | |
- | | 2 | Eles | ele | pron | <nowiki>pron-pers</nowiki> | <nowiki>M|3P|NOM</nowiki> | 8 | SUBJ | < | + | |
- | | 3 | <nowiki>[</nowiki> | < | + | |
- | | 4 | Hillary | Hillary | prop | prop | <nowiki>F|S</nowiki> | 9 | APP | < | + | <syn> |
- | | 5 | e | e | conj | <nowiki>conj-c</nowiki> | <nowiki><co-app></nowiki> | 4 | CO | <nowiki>_</nowiki> | < | + | <head>1</head> |
- | | 6 | < | + | <reltype>atribut adj.</reltype> |
- | | 7 | <nowiki>]</nowiki> | < | + | |
- | | 8 | podem | poder | v | <nowiki>v-fin</nowiki> | < | + | </tok> |
- | | 9 | ter | ter | v | <nowiki>v-inf</nowiki> | < | + | <tok> |
- | | 10 | alguma | algum | pron | < | + | <orth>IonBriac</orth> |
- | | 11 | espécie | espécie | n | n | < | + | |
- | | 12 | de | de | prp | prp | <nowiki>_</nowiki> | 11 | < | + | |
- | | 13 | acordo | acordo | n | n | <nowiki>M|S</nowiki> | 12 | < | + | |
- | | 14 | e | e | conj | < | + | |
- | | 15 | quem | quem | pron | <nowiki>pron-indp</nowiki> | < | + | <reltype>atribut subst.</reltype> |
- | | 16 | somos | ser | v | < | + | |
- | | 17 | nós | nós | pron | <nowiki>pron-pers</nowiki> | <nowiki>M/F|1P|NOM</nowiki> | 16 | SUBJ | < | + | </tok> |
- | | 18 | para | para | prp | prp | < | + | <tok> |
- | | 19 | dizer | dizer | v | <nowiki>v-inf</nowiki> | <nowiki>_</nowiki> | 18 | <nowiki>P<</nowiki> | <nowiki>_</nowiki> | < | + | |
- | | 20 | se | se | conj | <nowiki>conj-s</nowiki> | < | + | <ordno>4</ordno> |
- | | 21 | é | ser | v | <nowiki>v-fin</nowiki> | <nowiki>PR|3S|IND</nowiki> | 19 | ACC | <nowiki>_</nowiki> | <nowiki>_</ | + | |
- | | 22 | bom | bom | adj | adj | <nowiki>M|S</nowiki> | 21 | SC | <nowiki>_</nowiki> | < | + | |
- | | 23 | ou | ou | conj | < | + | |
- | | 24 | mau | mau | adj | adj | <nowiki>M|S</nowiki> | 22 | CJT | < | + | <reltype>atribut subst.</reltype> |
- | | 25 | < | + | |
+ | </tok> | ||
+ | <tok> | ||
+ | <orth>la</orth> | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>4</head> | ||
+ | <reltype>rel. prepoz.</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <head>4</head> | ||
+ | | ||
+ | | ||
+ | </tok> | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | <ctag>substantiv</ctag> | ||
+ | | ||
+ | <head>6</head> | ||
+ | <reltype>atribut subst.</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>9</head> | ||
+ | <reltype>rel. aux.</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | <ctag>verb</ctag> | ||
+ | | ||
+ | <head>19</head> | ||
+ | <reltype>predicat</reltype> | ||
+ | | ||
+ | | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>9</head> | ||
+ | <reltype>complement circumst.</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>9</head> | ||
+ | <reltype>complement dir.</reltype> | ||
+ | | ||
+ | | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <head>11</head> | ||
+ | <reltype>atribut subst.</reltype> | ||
+ | | ||
+ | </tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>12</head> | ||
+ | <reltype>atribut adj.</reltype> | ||
+ | | ||
+ | | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>12</head> | ||
+ | <reltype>atribut subst.</reltype> | ||
+ | | ||
+ | | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | <ctag>substantiv</ctag> | ||
+ | | ||
+ | <head>14</head> | ||
+ | <reltype>rel. poses.</reltype> | ||
+ | </syn> | ||
+ | </tok> | ||
+ | <tok> | ||
+ | <orth>Siderurgic</orth> | ||
+ | | ||
+ | | ||
+ | | ||
+ | <head>15</head> | ||
+ | | ||
+ | | ||
+ | </tok> | ||
+ | <tok> | ||
+ | <orth>din</orth> | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | <reltype>atribut subst.</reltype> | ||
+ | | ||
+ | | ||
+ | <tok> | ||
+ | | ||
+ | | ||
+ | | ||
+ | <syn> | ||
+ | <head>17</head> | ||
+ | <reltype>rel. prepoz.</reltype> | ||
+ | </syn> | ||
+ | </tok> | ||
+ | </s></code> | ||
==== Parsing ==== | ==== Parsing ==== | ||
- | Bosque | + | The corpus |
- | + | ||
- | The results of the CoNLL 2006 shared task are [[http:// | + | |
- | + | ||
- | ^ Parser (Authors) ^ LAS ^ UAS ^ | + | |
- | | MST (McDonald et al.) | 86.82 | 91.36 | | + | |
- | | Malt (Nivre et al.) | 87.60 | 91.22 | | + | |
- | | Nara (Yuchang Cheng) | 85.07 | 90.30 | | + | |
+ | I am not aware of any published evaluation of parsing accuracy on this data. |