[ Zurück ] [ Zurück (Seitenende) ] [ Seitenende ] [ Überkapitel ] [ Bitte Skript-Fehler melden ]
Covingtons Tokenizer in XFST ▸▸▸
! Covington-style tokenizer: emits one lower-cased token per line.
! A token is either a maximal run of alphanumeric characters (WORD)
! or a single non-alphanumeric character (SYMBOL).

! Letters and digits. Zero is written %0 because a bare 0 denotes
! epsilon in xfst regular expressions.
define AlphaNumeric [ A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z
                    | a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z
                    | %0|1|2|3|4|5|6|7|8|9 ] ;

! Whitespace characters.
define WS [ "\n" | " " | "\t" ] ;

! One or more alphanumeric characters.
define WORD [ AlphaNumeric + ] ;

! Any single symbol that is not alphanumeric (term complement). This also
! covers whitespace; such tokens are collapsed later by NormalizeSpace.
! The name must be spelled exactly like the definition above: a misspelled
! name is not an error in xfst but is silently read as a new
! multi-character symbol.
define SYMBOL [ \ AlphaNumeric ] ;

define TOKEN [ WORD | SYMBOL ] ;

! Parallel replacement mapping every upper-case letter to lower case.
define Downcase [ A->a, B->b, C->c, D->d, E->e, F->f, G->g, H->h, I->i,
                  J->j, K->k, L->l, M->m, N->n, O->o, P->p, Q->q, R->r,
                  S->s, T->t, U->u, V->v, W->w, X->x, Y->y, Z->z ] ;

! Collapse every maximal whitespace run into a single newline.
define NormalizeSpace [ WS+ @-> "\n" ] ;

! 1. Append a newline after every longest-match token.
! 2. Collapse whitespace runs (including the inserted newlines) into one newline.
! 3. Lower-case the result.
define Tokenizer [ [ TOKEN @-> ... "\n" ]
                   .o. NormalizeSpace
                   .o. Downcase ] ;
Tool tokenize vs. lookup
Schillers Tokenizer mit Abkürzungsbehandlung ▸▸▸
! Schiller-style tokenizer with abbreviation and number handling.
! Emits one token per line. The compiled network is inverted and saved to
! tok2.fst, then exercised through the external `tokenize` tool.
!
! Each statement sits on its own line: an xfst comment (!) runs to the end
! of the physical line, so a comment placed in the middle of a line would
! disable every definition that follows it on that line.

! Whitespace characters.
define WS [ " " | "\t" | "\n" ] ;

! Punctuation tokens; some are multi-character strings.
! NOTE(review): {’’} uses typographic quotes and may be a typesetting
! artifact of {''} (two ASCII apostrophes) -- confirm against the
! intended input encoding.
define SYMBOL [ "\"" | "." | "," | ";" | {’’} | {...} ] ;    ! and more

! One or more single symbols that are neither whitespace nor punctuation.
define WORD [ \ [ WS | SYMBOL ] ] + ;

! Known abbreviations; they keep their final period because the
! longest-match replacement prefers them over WORD.
define ABBR [ {A.} | {Mr.} | {Mrs.} | {E.g.} ] ;             ! and more

! Zero is quoted because a bare 0 denotes epsilon in xfst.
define DIGIT [ "0" | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ] ;

define NUMSYMBOL [ "-" | "." | "," ] ;

! Digit sequences that may be interspersed with number punctuation.
define NUM [ [ DIGIT + ] / NUMSYMBOL ] ;                     ! / = ignore operator

define TOKEN [ SYMBOL | WORD | ABBR | NUM ] ;

! 1. Append a newline after every longest-match token.
! 2. Collapse whitespace runs (including the inserted newlines) into one newline.
define Tokenizer [ [ TOKEN @-> ... "\n" ]
                   .o. [ WS+ @-> "\n" ] ] ;

! Put the inverted transducer on the stack and save it for the tokenize tool.
read regex Tokenizer.i ;
save stack tok2.fst

! Demo run. The shell needs plain ASCII single quotes around the text.
system echo 'E.g. 120.000 New Yorker live in New York.' | tokenize tok2.fst
Schillers Tokenizer mit naiver Multiword-Behandlung ▸▸▸
! Schiller-style tokenizer with naive multiword handling.
! Builds on the definitions (WS, TOKEN, ...) loaded from tok2.xfst.
source tok2.xfst

! Multiword tokens that should stay together as a single token.
define MWT [ {New York} | {Ad hoc} ] ;

! Extend the existing token class with the multiword tokens.
define TOKEN [ TOKEN | MWT ] ;

! Whitespace runs that contain at least one newline. Only these are
! collapsed, so a plain space inside a multiword token survives.
define WS1 [ WS+ & [ $ "\n" ] ] ;

! 1. Append a newline after every longest-match token.
! 2. Collapse newline-containing whitespace runs into a single newline.
define Tokenizer [ [ TOKEN @-> ... "\n" ]
                   .o. [ WS1 @-> "\n" ] ] ;

! Put the inverted transducer on the stack and save it for the tokenize tool.
read regex Tokenizer.i ;
save stack tok3.fst

! Demo run. The shell needs plain ASCII single quotes around the text.
system echo 'E.g. 120.000 New Yorker live in New York.' | tokenize tok3.fst
Frage
Schillers Tokenizer mit Multiword-Behandlung ▸▸▸
! Schiller-style tokenizer with boundary-aware multiword handling.
! Builds on the definitions (WS, SYMBOL, TOKEN, ...) loaded from tok2.xfst.
!
! Each statement sits on its own line: an xfst comment (!) runs to the end
! of the physical line, so a comment placed in the middle of a line would
! disable every definition that follows it on that line.
source tok2.xfst

! Multiword tokens that should stay together as a single token.
define MWT [ {New York} | {Ad hoc} | {to and fro} ] ;

! A multiword token only counts as a token when it is wrapped in the
! marker spaces inserted by the Tokenizer below.
define TOKEN [ TOKEN | " " MWT " " ] ;    ! whitespace marking around MWT

! Left and right contexts in which a multiword token may be marked:
! punctuation, whitespace, or the string edge.
define BOUND [ SYMBOL | WS | .#. ] ;

! Whitespace runs that contain at least one newline.
define WS1 [ WS+ & [ $ "\n" ] ] ;

! 1. Normalise every whitespace run to a single space.
! 2. Wrap multiword tokens in spaces when they stand between boundaries.
! 3. Append a newline after every longest-match token.
! 4. Collapse newline-containing whitespace runs into a single newline.
define Tokenizer [ [ WS+ @-> " " ]
                   .o. [ MWT @-> " " ... " " || BOUND _ BOUND ]
                   .o. [ TOKEN @-> ... "\n" ]
                   .o. [ WS1 @-> "\n" ] ] ;

! Put the inverted transducer on the stack and save it for the tokenize tool.
read regex Tokenizer.i ;
save stack tok4.fst

! Demo run. The shell needs plain ASCII single quotes around the text.
system echo 'E.g. 120.000 New Yorker live in New York.' | tokenize tok4.fst
[ Zurück ] [ Zurück (Seitenende) ] [ Seitenbeginn ] [ Überkapitel ] [ Bitte Skript-Fehler melden ]