Re: OT: CXS chart and machine-readable Unicode->CXS mappings
From: | Mark J. Reed <markjreed@...> |
Date: | Tuesday, March 9, 2004, 16:41 |
On Mon, Mar 08, 2004 at 11:13:42PM -0500, Mark J. Reed wrote:
> On Mon, Mar 08, 2004 at 09:18:25PM -0600, Herman Miller wrote:
> > >I composed the Conlang X-Sampa (CXS) chart in a form that is readable
> > >from Lisp, Perl and C/C++.
I've wrapped a module around the Perl so that you can simply do
use CXS;
And then, if you have a UTF-8-encoded IPA transcription in $ipa, you can
get the CXS equivalent via
my $cxs = ipa2cxs($ipa);
or vice-versa:
my $ipa = cxs2ipa($cxs);
The functions are naïve and won't do any reordering or anything like
that (such as turning "ab)" into an a and a b with a tie bar over them), but
they work fairly well even so. At least cxs2ipa is smart enough to use the
longest matching sequence.
I tried attaching the .pm, but the listserv rejected it, so I'm just
going to include the text here.
-Mark
8<-------------------------------CUT HERE------------------------------------
package CXS;
# Conversion helpers between IPA text and Conlang X-SAMPA (CXS).
# Exports ipa2cxs and cxs2ipa into the caller's namespace by default.
use strict;
use warnings;
# Quote the parent class: a bareword here only works without strictures.
use base 'Exporter';
our @EXPORT = qw(ipa2cxs cxs2ipa);
# Forward lookup table: Unicode code point (written as a hex literal)
# => the CXS string used for that character.
#
# The values are double-quoted, so "\\" in the source is a single backslash
# in the resulting string and "\"" is a double quote.
#
# The mapping is many-to-one: several code points share the same CXS string
# (for example 0x0269 and 0x026A both map to "I", and 0x02C9, 0x02DD, 0x02E5
# and 0x030B all map to "_T").  Any table derived by inverting this one can
# therefore keep only one code point per CXS string.
my %charmap_cxs=
(
0x00E6 => "&",
0x00E7 => "C",
0x00F0 => "D",
0x00F8 => "2",
0x0127 => "X\\",
0x014B => "N",
0x0153 => "9",
0x0180 => "B",
0x01A5 => "p_<",
0x01AB => "t_j",
0x01AD => "t_<",
0x01BB => "dz)",
0x01C0 => "|\\",
0x01C1 => "|\\|\\",
0x01C2 => "=\\",
0x01C3 => "!\\",
0x0250 => "6",
0x0251 => "A",
0x0252 => "Q",
0x0253 => "b_<",
0x0254 => "O",
0x0255 => "s\\",
0x0256 => "d`",
0x0257 => "d_<",
0x0258 => "@\\",
0x0259 => "@",
0x025A => "@`",
0x025B => "E",
0x025C => "3",
0x025D => "3`",
0x025E => "3\\",
0x025F => "J\\",
0x0260 => "g_<",
0x0261 => "g",
0x0262 => "G\\",
0x0263 => "G",
0x0264 => "7",
0x0265 => "H",
0x0266 => "h\\",
0x0267 => "x\\",
0x0268 => "1",
0x0269 => "I",
0x026A => "I",
0x026B => "5",
0x026C => "K",
0x026D => "l`",
0x026E => "K\\",
0x026F => "M",
0x0270 => "M\\",
0x0271 => "F",
0x0272 => "J",
0x0273 => "n`",
0x0274 => "N\\",
0x0275 => "8",
0x0276 => "&\\",
0x0277 => "U",
0x0278 => "p\\",
0x0279 => "r\\",
0x027A => "l\\",
0x027B => "r\\`",
0x027C => "r\\_r",
0x027D => "r`",
0x027E => "4",
0x027F => "z=",
0x0280 => "R\\",
0x0281 => "R",
0x0282 => "s`",
0x0283 => "S",
0x0284 => "J\\_<",
0x0285 => "z`=",
0x0286 => "S_j",
0x0287 => "|\\",
0x0288 => "t`",
0x0289 => "u\\",
0x028A => "U",
0x028B => "P",
0x028C => "V",
0x028D => "W",
0x028E => "L",
0x028F => "Y",
0x0290 => "z`",
0x0291 => "z\\",
0x0292 => "Z",
0x0293 => "Z_j",
0x0294 => "?",
0x0295 => "?\\",
0x0296 => "|\\|\\",
0x0297 => "!\\",
0x0298 => "O\\",
0x0299 => "B\\",
0x029A => "&\\",
0x029B => "G\\_<",
0x029C => "H\\",
0x029D => "j\\",
0x029F => "L\\",
0x02A0 => "q_<",
0x02A1 => ">\\",
0x02A2 => "<\\",
0x02A3 => "dz)",
0x02A4 => "dZ)",
0x02A5 => "dz\\)",
0x02A6 => "ts)",
0x02A7 => "tS)",
0x02A8 => "ts\\)",
0x02A9 => "fN)",
0x02AA => "ls)",
0x02AB => "lz)",
0x02AC => "._w_w",
0x02AD => "._d_d",
0x02B0 => "_h",
0x02B1 => "_t",
0x02B2 => "_j",
0x02B7 => "_w",
0x02B8 => "_j",
0x02BC => "_>",
0x02C0 => "_>",
0x02C7 => "_F_R",
0x02C8 => "'",
0x02C9 => "_T",
0x02CC => "\"",
0x02CD => "_L",
0x02CE => "_L_B",
0x02CF => "_B_L",
0x02D0 => ":",
0x02D1 => ":\\",
0x02D2 => "_O",
0x02D3 => "_c",
0x02D4 => "_r",
0x02D5 => "_o",
0x02D6 => "_+",
0x02D7 => "_-",
0x02D8 => "_X",
0x02DA => "_0",
0x02DC => "~",
0x02DD => "_T",
0x02DE => "`",
0x02E0 => "_G",
0x02E1 => "_l",
0x02E4 => "_?\\",
0x02E5 => "_T",
0x02E6 => "_H",
0x02E7 => "_M",
0x02E8 => "_L",
0x02E9 => "_B",
0x02EC => "_v",
0x0300 => "_L",
0x0301 => "_H",
0x0302 => "_F",
0x0303 => "~",
0x0304 => "_M",
0x0306 => "_X",
0x0308 => "_\"",
0x030A => "_0",
0x030B => "_T",
0x030C => "_R",
0x030D => "=",
0x030F => "_B",
0x0318 => "_A",
0x0319 => "_q",
0x031A => "_}",
0x031C => "_c",
0x031D => "_r",
0x031E => "_o",
0x031F => "_+",
0x0320 => "_-",
0x0321 => "_j",
0x0322 => "`",
0x0324 => "_t",
0x0325 => "_0",
0x0329 => "=",
0x032A => "_d",
0x032B => "_w",
0x032C => "_v",
0x032F => "_^",
0x0330 => "_k",
0x0334 => "_e",
0x0339 => "_O",
0x033A => "_a",
0x033B => "_m",
0x033C => "_N",
0x033D => "_x",
0x0361 => "_",
0x03B2 => "B",
0x03B8 => "T",
0x03C7 => "X",
0x2016 => "||",
0x203F => "-\\",
0x207F => "_n",
0x2191 => "^",
0x2193 => "!",
0x2197 => "<R>",
0x2198 => "<F>",
);
# Reverse lookup table: CXS string => Unicode code point.
#
# %charmap_cxs maps several code points to the same CXS string, so a plain
# `reverse %charmap_cxs` keeps whichever duplicate happens to come last in
# hash iteration order, which is unspecified and can differ between runs.
# Build the table explicitly so the choice is reproducible: when a CXS string
# has more than one candidate, the lowest code point wins.
# NOTE(review): "lowest code point" is only a deterministic tie-break; it is
# not necessarily the preferred IPA character for every duplicate -- confirm
# the desired choice per entry if round-tripping matters.
my %charmap_ipa;
for my $codepoint (sort { $a <=> $b } keys %charmap_cxs)
{
    my $sequence = $charmap_cxs{$codepoint};
    $charmap_ipa{$sequence} = $codepoint
        unless exists $charmap_ipa{$sequence};
}

# All known CXS sequences, longest first, so that a scan through the list
# finds the longest sequence matching at a given position.  The secondary
# string comparison only makes the order reproducible.
my @cxs_sequences =
    sort { length($b) <=> length($a) || $a cmp $b } keys %charmap_ipa;
# ipa2cxs($ipa)
#
# Converts an IPA transcription to CXS, one code point at a time.
#
#   $ipa     - the IPA text; it is split into code points with
#              unpack('U*', ...).
#   returns  - a string in which every code point found in %charmap_cxs is
#              replaced by its CXS string and every other code point is
#              copied through unchanged.  Returns '' for undefined or empty
#              input.
#
# No reordering or combining is attempted; each code point is translated
# independently of its neighbours.
#
# NOTE(review): how unpack('U*') treats undecoded UTF-8 bytes versus an
# already-decoded character string has varied between Perl versions --
# confirm against the Perl version in use.
sub ipa2cxs
{
    my $ipa = shift;
    return '' unless defined $ipa;

    my $cxs = '';
    for my $codepoint (unpack('U*', $ipa))
    {
        # unpack() yields code point *numbers*, so an unmapped character has
        # to be turned back into a character with chr(); appending the number
        # itself would emit e.g. "97" in place of "a".
        $cxs .= exists $charmap_cxs{$codepoint}
            ? $charmap_cxs{$codepoint}
            : chr($codepoint);
    }
    return $cxs;
}
# cxs2ipa($cxs)
#
# Converts a CXS transcription to IPA.
#
#   $cxs     - the CXS text.
#   returns  - a string in which each recognised CXS sequence is replaced by
#              chr() of the code point stored for it in %charmap_ipa, and
#              every other character is copied through unchanged.  Returns ''
#              for undefined or empty input.
#
# The input is consumed from the left.  At each position the sequences in
# @cxs_sequences are tried in list order (longest first) and the first one
# that matches is used, so the longest matching sequence always wins.
sub cxs2ipa
{
    my $cxs = shift;
    return '' unless defined $cxs;

    my $ipa = '';
    POSITION: while (length $cxs)
    {
        for my $sequence (@cxs_sequences)
        {
            my $width = length $sequence;
            next unless substr($cxs, 0, $width) eq $sequence;

            $ipa .= chr($charmap_ipa{$sequence});
            substr($cxs, 0, $width, '');

            # Restart the scan from the longest sequence for the next
            # position.  Carrying on down the list here would only try the
            # remaining, shorter sequences against the new head of the
            # string and could miss a longer match.
            next POSITION;
        }

        # Nothing matched: pass one character through as-is.  substr() is
        # used rather than a /^(.)/ substitution because "." does not match
        # a newline, which would leave the string unchanged and the loop
        # spinning forever.
        $ipa .= substr($cxs, 0, 1, '');
    }
    return $ipa;
}
1;
Reply