You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
410 lines
11 KiB
410 lines
11 KiB
/*************************************************************************/ |
|
/* */ |
|
/* Language Technologies Institute */ |
|
/* Carnegie Mellon University */ |
|
/* Copyright (c) 2001 */ |
|
/* All Rights Reserved. */ |
|
/* */ |
|
/* Permission is hereby granted, free of charge, to use and distribute */ |
|
/* this software and its documentation without restriction, including */ |
|
/* without limitation the rights to use, copy, modify, merge, publish, */ |
|
/* distribute, sublicense, and/or sell copies of this work, and to */ |
|
/* permit persons to whom this work is furnished to do so, subject to */ |
|
/* the following conditions: */ |
|
/* 1. The code must retain the above copyright notice, this list of */ |
|
/* conditions and the following disclaimer. */ |
|
/* 2. Any modifications must be clearly marked as such. */ |
|
/* 3. Original authors' names are not deleted. */ |
|
/* 4. The authors' names are not used to endorse or promote products */ |
|
/* derived from this software without specific prior written */ |
|
/* permission. */ |
|
/* */ |
|
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */ |
|
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ |
|
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */ |
|
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */ |
|
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ |
|
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ |
|
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ |
|
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ |
|
/* THIS SOFTWARE. */ |
|
/* */ |
|
/*************************************************************************/ |
|
/* Author: Alan W Black (awb@cs.cmu.edu) */ |
|
/* Date: January 2001 */ |
|
/*************************************************************************/ |
|
/* */ |
|
/* English text expanders */ |
|
/* */ |
|
/* numbers, digits, ids (years), money */ |
|
/* */ |
|
/*************************************************************************/ |
|
|
|
#include <ctype.h> |
|
#include "us_text.h" |
|
|
|
/* Cardinal words for the single digits; the array index is the digit value. */
static const char * const digit2num[] = {
    "zero", "one", "two",   "three", "four",
    "five", "six", "seven", "eight", "nine"
};
|
|
|
/* Cardinal words for 10-19; the array index is the units digit.
 * en_exp_number() never indexes entry 0: a two-digit string ending in '0'
 * is resolved through digit2enty before the teen case is considered. */
static const char * const digit2teen[] = {
    "ten",
    "eleven",  "twelve",    "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen"
};
|
|
|
/* Cardinal words for the multiples of ten; the array index is the tens digit.
 * en_exp_number() never indexes entry 0: a leading '0' in a two-digit string
 * is handled before this table is consulted. */
static const char * const digit2enty[] = {
    "zero",
    "ten",   "twenty",  "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety"
};
|
|
|
/* Ordinal words for the single digits; entry i is the ordinal form of
 * digit2num[i]. */
static const char * const ord2num[] = {
    "zeroth", "first", "second",  "third",  "fourth",
    "fifth",  "sixth", "seventh", "eighth", "ninth"
};
|
|
|
/* Ordinal words for 10-19; entry i is the ordinal form of digit2teen[i]. */
static const char * const ord2teen[] = {
    "tenth",
    "eleventh",  "twelfth",     "thirteenth", "fourteenth", "fifteenth",
    "sixteenth", "seventeenth", "eighteenth", "nineteenth"
};
|
|
|
/* Ordinal words for the multiples of ten; entry i is the ordinal form of
 * digit2enty[i]. */
static const char * const ord2enty[] = {
    "zeroth",
    "tenth",    "twentieth",  "thirtieth", "fortieth", "fiftieth",
    "sixtieth", "seventieth", "eightieth", "ninetieth"
};
|
|
|
cst_val *en_exp_number(const char *numstring) |
|
{ |
|
/* Expand given token to list of words pronouncing it as a number */ |
|
int num_digits = cst_strlen(numstring); |
|
char part[4]; |
|
cst_val *p; |
|
int i; |
|
|
|
if (num_digits == 0) |
|
return NULL; |
|
else if (num_digits == 1) |
|
return en_exp_digits(numstring); |
|
else if (num_digits == 2) |
|
{ |
|
if (numstring[0] == '0') |
|
{ |
|
if (numstring[1] == '0') |
|
return 0; |
|
else |
|
return cons_val(string_val(digit2num[numstring[1]-'0']),0); |
|
} |
|
else if (numstring[1] == '0') |
|
return cons_val(string_val(digit2enty[numstring[0]-'0']),0); |
|
else if (numstring[0] == '1') |
|
return cons_val(string_val(digit2teen[numstring[1]-'0']),0); |
|
else |
|
return cons_val(string_val(digit2enty[numstring[0]-'0']), |
|
en_exp_digits(numstring+1)); |
|
} |
|
else if (num_digits == 3) |
|
{ |
|
if (numstring[0] == '0') |
|
return en_exp_number(numstring+1); |
|
else |
|
return cons_val(string_val(digit2num[numstring[0]-'0']), |
|
cons_val(string_val("hundred"), |
|
en_exp_number(numstring+1))); |
|
} |
|
else if (num_digits < 7) |
|
{ |
|
for (i=0; i < num_digits-3; i++) |
|
part[i] = numstring[i]; |
|
part[i]='\0'; |
|
p = en_exp_number(part); |
|
if (p == 0) /* no thousands */ |
|
return en_exp_number(numstring+i); |
|
else |
|
return val_append(p,cons_val(string_val("thousand"), |
|
en_exp_number(numstring+i))); |
|
} |
|
else if (num_digits < 10) |
|
{ |
|
for (i=0; i < num_digits-6; i++) |
|
part[i] = numstring[i]; |
|
part[i]='\0'; |
|
p = en_exp_number(part); |
|
if (p == 0) /* no millions */ |
|
return en_exp_number(numstring+i); |
|
else |
|
return val_append(p,cons_val(string_val("million"), |
|
en_exp_number(numstring+i))); |
|
} |
|
else if (num_digits < 13) |
|
{ /* If there are pedantic brits out there, tough!, 10^9 is a billion */ |
|
for (i=0; i < num_digits-9; i++) |
|
part[i] = numstring[i]; |
|
part[i]='\0'; |
|
p = en_exp_number(part); |
|
if (p == 0) /* no billions */ |
|
return en_exp_number(numstring+i); |
|
else |
|
return val_append(p,cons_val(string_val("billion"), |
|
en_exp_number(numstring+i))); |
|
} |
|
else /* Way too many digits here, to be a number */ |
|
{ |
|
return en_exp_digits(numstring); |
|
} |
|
} |
|
|
|
cst_val *en_exp_ordinal(const char *rawnumstring) |
|
{ |
|
/* return ordinal for digit string */ |
|
cst_val *card, *o; |
|
const cst_val *t; |
|
const char *l; |
|
const char *ord; |
|
char *numstring; |
|
int i,j; |
|
|
|
numstring = cst_strdup(rawnumstring); |
|
for (j=i=0; i < cst_strlen(rawnumstring); i++) |
|
if (rawnumstring[i] != ',') |
|
{ |
|
numstring[j] = rawnumstring[i]; |
|
j++; |
|
} |
|
numstring[j] = '\0'; |
|
card = val_reverse(en_exp_number(numstring)); |
|
cst_free(numstring); |
|
|
|
l = val_string(val_car(card)); |
|
ord = 0; |
|
for (i=0; i<10; i++) |
|
if (cst_streq(l,digit2num[i])) |
|
ord = ord2num[i]; |
|
if (!ord) |
|
for (i=0; i<10; i++) |
|
if (cst_streq(l,digit2teen[i])) |
|
ord = ord2teen[i]; |
|
if (!ord) |
|
for (i=0; i<10; i++) |
|
if (cst_streq(l,digit2enty[i])) |
|
ord = ord2enty[i]; |
|
if (cst_streq(l,"hundred")) |
|
ord = "hundredth"; |
|
if (cst_streq(l,"thousand")) |
|
ord = "thousandth"; |
|
if (cst_streq(l,"billion")) |
|
ord = "billtionth"; |
|
if (!ord) /* dunno, so don't convert anything */ |
|
return card; |
|
o = cons_val(string_val(ord),0); |
|
for (t=val_cdr(card); t; t=val_cdr(t)) |
|
o = cons_val(val_car(t),o); |
|
delete_val(card); |
|
return o; |
|
} |
|
|
|
cst_val *en_exp_id(const char *numstring) |
|
{ |
|
/* Expand numstring as pairs as in years or ids */ |
|
char aaa[3]; |
|
|
|
if ((cst_strlen(numstring) == 4) && |
|
(numstring[2] == '0') && |
|
(numstring[3] == '0')) |
|
{ |
|
if (numstring[1] == '0') |
|
return en_exp_number(numstring); /* 2000, 3000 */ |
|
else |
|
{ |
|
aaa[0] = numstring[0]; |
|
aaa[1] = numstring[1]; |
|
aaa[2] = '\0'; |
|
return val_append(en_exp_number(aaa), |
|
cons_val(string_val("hundred"),0)); |
|
} |
|
} |
|
else if ((cst_strlen(numstring) == 3) && |
|
(numstring[0] != '0') && |
|
(numstring[1] == '0') && |
|
(numstring[2] == '0')) |
|
{ |
|
return cons_val(string_val(digit2num[numstring[0]-'0']), |
|
cons_val(string_val("hundred"),0)); |
|
} |
|
else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0') |
|
&& (numstring[1] == '0')) |
|
return cons_val(string_val("zero"), |
|
cons_val(string_val("zero"),NULL)); |
|
else if ((cst_strlen(numstring) == 2) && (numstring[0] == '0')) |
|
return cons_val(string_val("oh"), |
|
en_exp_digits(&numstring[1])); |
|
else if (((cst_strlen(numstring) == 4) && |
|
((numstring[1] == '0'))) || |
|
(cst_strlen(numstring) < 3)) |
|
return en_exp_number(numstring); |
|
else if (cst_strlen(numstring)%2 == 1) |
|
{ |
|
return cons_val(string_val(digit2num[numstring[0]-'0']), |
|
en_exp_id(&numstring[1])); |
|
} |
|
else |
|
{ |
|
aaa[0] = numstring[0]; |
|
aaa[1] = numstring[1]; |
|
aaa[2] = '\0'; |
|
return val_append(en_exp_number(aaa),en_exp_id(&numstring[2])); |
|
} |
|
} |
|
|
|
/*
 * Expand a signed real number, optionally with a fractional part and an
 * exponent, into the list of words pronouncing it.
 *
 *   numstring  NUL-terminated string such as "-12.5e3"; may be NULL.
 *
 * A leading '-' or '+' becomes "minus" / "plus".  The text around the
 * first 'e' (or, failing that, the first 'E') is expanded recursively with
 * the word "e" in between.  Otherwise the text before the first '.' is read
 * as a number and the text after it digit by digit, separated by "point".
 * Returns NULL for a NULL input.
 */
cst_val *en_exp_real(const char *numstring)
{
    char *aaa;
    const char *p;
    cst_val *r;

    /* Checked once up front so the strchr() calls below never see NULL. */
    if (numstring == NULL)
        return NULL;

    if (numstring[0] == '-')
        r = cons_val(string_val("minus"),
                     en_exp_real(&numstring[1]));
    else if (numstring[0] == '+')
        r = cons_val(string_val("plus"),
                     en_exp_real(&numstring[1]));
    else if (((p = strchr(numstring, 'e')) != NULL) ||
             ((p = strchr(numstring, 'E')) != NULL))
    {
        /* Working copy cut off where the exponent marker sits. */
        aaa = cst_strdup(numstring);
        aaa[p - numstring] = '\0';
        r = val_append(en_exp_real(aaa),
                       cons_val(string_val("e"),
                                en_exp_real(p + 1)));
        cst_free(aaa);
    }
    else if ((p = strchr(numstring, '.')) != NULL)
    {
        /* Working copy cut off where the decimal point sits. */
        aaa = cst_strdup(numstring);
        aaa[p - numstring] = '\0';
        r = val_append(en_exp_number(aaa),
                       cons_val(string_val("point"),
                                en_exp_digits(p + 1)));
        cst_free(aaa);
    }
    else
        r = en_exp_number(numstring);   /* no sign, exponent or point */

    return r;
}
|
|
|
cst_val *en_exp_digits(const char *numstring) |
|
{ |
|
/* Expand given token to list of words pronouncing it as digits */ |
|
cst_val *d = 0; |
|
const char *p; |
|
|
|
for (p=numstring; *p; p++) |
|
{ |
|
if ((*p >= '0') && (*p <= '9')) |
|
d = cons_val(string_val(digit2num[*p-'0']),d); |
|
else |
|
d = cons_val(string_val("umpty"),d); |
|
} |
|
|
|
return val_reverse(d); |
|
} |
|
|
|
cst_val *en_exp_letters(const char *lets) |
|
{ |
|
/* returns these as list of single char symbols */ |
|
char *aaa; |
|
cst_val *r; |
|
int i; |
|
|
|
aaa = cst_alloc(char,2); |
|
aaa[1] = '\0'; |
|
for (r=0,i=0; lets[i] != '\0'; i++) |
|
{ |
|
aaa[0] = lets[i]; |
|
if (isupper((int)aaa[0])) |
|
aaa[0] = tolower((int)aaa[0]); |
|
if (strchr("0123456789",aaa[0])) |
|
r = cons_val(string_val(digit2num[aaa[0]-'0']),r); |
|
else if (cst_streq(aaa,"a")) |
|
r = cons_val(string_val("_a"),r); |
|
else |
|
r = cons_val(string_val(aaa),r); |
|
} |
|
cst_free(aaa); |
|
|
|
return val_reverse(r); |
|
} |
|
|
|
/*
 * Convert a small upper-case Roman numeral to its integer value.
 *
 *   roman  NUL-terminated string.
 *
 * Only 'X' (10), 'V' (5) and 'I' (1) are recognised, along with the
 * subtractive pairs "IV" (4) and "IX" (9).  Every other character
 * contributes nothing.  Returns the sum, 0 for an empty string.
 */
int en_exp_roman(const char *roman)
{
    int total = 0;
    const char *c = roman;

    while (*c != '\0')
    {
        switch (*c)
        {
        case 'X':
            total += 10;
            break;
        case 'V':
            total += 5;
            break;
        case 'I':
            if (c[1] == 'V')
            {
                total += 4;
                c++;            /* the 'V' is consumed with the 'I' */
            }
            else if (c[1] == 'X')
            {
                total += 9;
                c++;            /* the 'X' is consumed with the 'I' */
            }
            else
                total += 1;
            break;
        default:
            break;
        }
        c++;
    }

    return total;
}
|
|
|
|
|
|