Re: OT: CXS chart and machine-readable Unicode->CXS mappings
| From: | Mark J. Reed <markjreed@...> | 
|---|
| Date: | Tuesday, March 9, 2004, 16:41 | 
|---|
On Mon, Mar 08, 2004 at 11:13:42PM -0500, Mark J. Reed wrote:
> On Mon, Mar 08, 2004 at 09:18:25PM -0600, Herman Miller wrote:
> > >I composed the Conlang X-Sampa (CXS) chart in a form that is readable
> > >from Lisp, Perl and C/C++.
I've wrapped a module around the Perl so that you can simply do
        use CXS;
And then, if you have a UTF-8-encoded IPA transcription in $ipa, you can
get the CXS equivalent via
        my $cxs = ipa2cxs($ipa);
or vice-versa:
        my $ipa = cxs2ipa($cxs);
The functions are naïve and won't do any reordering or anything like
that (such as turning "ab)" into "a" and "b" with a tie bar over them), but
they work fairly well even so. At least cxs2ipa is smart enough to use the
longest matching sequence.
I tried attaching the .pm, but the listserv rejected it, so I'm just
going to include the text here.
-Mark
8<-------------------------------CUT HERE------------------------------------
package CXS;

# Conversion helpers between Unicode IPA text and its CXS transliteration.
# Loading the module with a plain `use CXS;` exports ipa2cxs() and cxs2ipa()
# into the caller's namespace.
use strict;
use warnings;

# The parent class name is quoted because a bareword here only compiles
# while strictures are off.
use base 'Exporter';

our @EXPORT = qw(ipa2cxs cxs2ipa);
# Lookup table from Unicode code point (numeric key, written in hex) to the
# CXS string that represents that character.  ipa2cxs() consults it
# directly; the reverse table used by cxs2ipa() is derived from it.
#
# The values are double-quoted, so each "\\" below is a single backslash in
# the resulting CXS string.
#
# Several code points share one CXS string (for example 0x0269 and 0x026A
# both map to "I", and 0x0277 and 0x028A both map to "U"), so the table is
# not one-to-one: converting IPA to CXS and back may not return the
# original code point.
my %charmap_cxs=
(
    0x00E6 => "&",
    0x00E7 => "C",
    0x00F0 => "D",
    0x00F8 => "2",
    0x0127 => "X\\",
    0x014B => "N",
    0x0153 => "9",
    0x0180 => "B",
    0x01A5 => "p_<",
    0x01AB => "t_j",
    0x01AD => "t_<",
    0x01BB => "dz)",
    0x01C0 => "|\\",
    0x01C1 => "|\\|\\",
    0x01C2 => "=\\",
    0x01C3 => "!\\",
    0x0250 => "6",
    0x0251 => "A",
    0x0252 => "Q",
    0x0253 => "b_<",
    0x0254 => "O",
    0x0255 => "s\\",
    0x0256 => "d`",
    0x0257 => "d_<",
    0x0258 => "@\\",
    0x0259 => "@",
    0x025A => "@`",
    0x025B => "E",
    0x025C => "3",
    0x025D => "3`",
    0x025E => "3\\",
    0x025F => "J\\",
    0x0260 => "g_<",
    0x0261 => "g",
    0x0262 => "G\\",
    0x0263 => "G",
    0x0264 => "7",
    0x0265 => "H",
    0x0266 => "h\\",
    0x0267 => "x\\",
    0x0268 => "1",
    0x0269 => "I",
    0x026A => "I",
    0x026B => "5",
    0x026C => "K",
    0x026D => "l`",
    0x026E => "K\\",
    0x026F => "M",
    0x0270 => "M\\",
    0x0271 => "F",
    0x0272 => "J",
    0x0273 => "n`",
    0x0274 => "N\\",
    0x0275 => "8",
    0x0276 => "&\\",
    0x0277 => "U",
    0x0278 => "p\\",
    0x0279 => "r\\",
    0x027A => "l\\",
    0x027B => "r\\`",
    0x027C => "r\\_r",
    0x027D => "r`",
    0x027E => "4",
    0x027F => "z=",
    0x0280 => "R\\",
    0x0281 => "R",
    0x0282 => "s`",
    0x0283 => "S",
    0x0284 => "J\\_<",
    0x0285 => "z`=",
    0x0286 => "S_j",
    0x0287 => "|\\",
    0x0288 => "t`",
    0x0289 => "u\\",
    0x028A => "U",
    0x028B => "P",
    0x028C => "V",
    0x028D => "W",
    0x028E => "L",
    0x028F => "Y",
    0x0290 => "z`",
    0x0291 => "z\\",
    0x0292 => "Z",
    0x0293 => "Z_j",
    0x0294 => "?",
    0x0295 => "?\\",
    0x0296 => "|\\|\\",
    0x0297 => "!\\",
    0x0298 => "O\\",
    0x0299 => "B\\",
    0x029A => "&\\",
    0x029B => "G\\_<",
    0x029C => "H\\",
    0x029D => "j\\",
    0x029F => "L\\",
    0x02A0 => "q_<",
    0x02A1 => ">\\",
    0x02A2 => "<\\",
    0x02A3 => "dz)",
    0x02A4 => "dZ)",
    0x02A5 => "dz\\)",
    0x02A6 => "ts)",
    0x02A7 => "tS)",
    0x02A8 => "ts\\)",
    0x02A9 => "fN)",
    0x02AA => "ls)",
    0x02AB => "lz)",
    0x02AC => "._w_w",
    0x02AD => "._d_d",
    0x02B0 => "_h",
    0x02B1 => "_t",
    0x02B2 => "_j",
    0x02B7 => "_w",
    0x02B8 => "_j",
    0x02BC => "_>",
    0x02C0 => "_>",
    0x02C7 => "_F_R",
    0x02C8 => "'",
    0x02C9 => "_T",
    0x02CC => "\"",
    0x02CD => "_L",
    0x02CE => "_L_B",
    0x02CF => "_B_L",
    0x02D0 => ":",
    0x02D1 => ":\\",
    0x02D2 => "_O",
    0x02D3 => "_c",
    0x02D4 => "_r",
    0x02D5 => "_o",
    0x02D6 => "_+",
    0x02D7 => "_-",
    0x02D8 => "_X",
    0x02DA => "_0",
    0x02DC => "~",
    0x02DD => "_T",
    0x02DE => "`",
    0x02E0 => "_G",
    0x02E1 => "_l",
    0x02E4 => "_?\\",
    0x02E5 => "_T",
    0x02E6 => "_H",
    0x02E7 => "_M",
    0x02E8 => "_L",
    0x02E9 => "_B",
    0x02EC => "_v",
    0x0300 => "_L",
    0x0301 => "_H",
    0x0302 => "_F",
    0x0303 => "~",
    0x0304 => "_M",
    0x0306 => "_X",
    0x0308 => "_\"",
    0x030A => "_0",
    0x030B => "_T",
    0x030C => "_R",
    0x030D => "=",
    0x030F => "_B",
    0x0318 => "_A",
    0x0319 => "_q",
    0x031A => "_}",
    0x031C => "_c",
    0x031D => "_r",
    0x031E => "_o",
    0x031F => "_+",
    0x0320 => "_-",
    0x0321 => "_j",
    0x0322 => "`",
    0x0324 => "_t",
    0x0325 => "_0",
    0x0329 => "=",
    0x032A => "_d",
    0x032B => "_w",
    0x032C => "_v",
    0x032F => "_^",
    0x0330 => "_k",
    0x0334 => "_e",
    0x0339 => "_O",
    0x033A => "_a",
    0x033B => "_m",
    0x033C => "_N",
    0x033D => "_x",
    0x0361 => "_",
    0x03B2 => "B",
    0x03B8 => "T",
    0x03C7 => "X",
    0x2016 => "||",
    0x203F => "-\\",
    0x207F => "_n",
    0x2191 => "^",
    0x2193 => "!",
    0x2197 => "<R>",
    0x2198 => "<F>",
);
# Reverse lookup: CXS string => Unicode code point.
#
# Several code points in %charmap_cxs share one CXS string (0x0269 and
# 0x026A are both "I", for example).  A plain `reverse` keeps whichever
# duplicate the hash happens to yield last, and hash order is not stable, so
# the same CXS input could produce different output on different runs.
# Walking the code points in ascending order and keeping the first one seen
# makes the lowest code point win every time.  This is a tie-break for
# determinism only, not a judgement about which glyph is preferable.
my %charmap_ipa;
for my $codepoint (sort { $a <=> $b } keys %charmap_cxs)
{
    my $sequence = $charmap_cxs{$codepoint};
    $charmap_ipa{$sequence} = $codepoint
        unless exists $charmap_ipa{$sequence};
}

# Every known CXS sequence, longest first, so a scan over this list finds the
# longest sequence matching at a given position.  Equal-length entries are
# ordered alphabetically purely to keep the list order stable.
my @cxs_sequences =
    sort { length($b) <=> length($a) or $a cmp $b } keys %charmap_ipa;
# Convert IPA text to its CXS transliteration.
#
# Takes one argument, the IPA string, and returns a new string in which each
# character whose code point is a key of %charmap_cxs is replaced by the
# mapped CXS sequence.  Characters without an entry are copied through
# unchanged.  Characters are handled one at a time, in input order; nothing
# is reordered.  An undefined argument yields the empty string.
sub ipa2cxs
{
    my $ipa = shift;
    return '' unless defined $ipa;

    my $cxs = '';
    for my $codepoint (unpack('U*', $ipa))
    {
        # unpack hands back code point numbers, so an unmapped character has
        # to be turned back into a character with chr(); appending the number
        # itself would turn "a" into "97".
        $cxs .= exists $charmap_cxs{$codepoint}
            ? $charmap_cxs{$codepoint}
            : chr($codepoint);
    }
    return $cxs;
}
# Convert CXS text to IPA.
#
# Takes one argument, the CXS string, and returns a new string built by
# scanning the input left to right.  At each position the longest entry of
# @cxs_sequences that matches is replaced by the character whose code point
# %charmap_ipa gives for it; if no entry matches, one input character is
# copied through unchanged.  An undefined argument yields the empty string.
#
# Relies on @cxs_sequences being ordered longest first.
sub cxs2ipa
{
    my $cxs = shift;
    return '' unless defined $cxs;

    my $ipa = '';

    POSITION:
    while (length $cxs)
    {
        for my $sequence (@cxs_sequences)
        {
            my $seq_len = length $sequence;
            next unless substr($cxs, 0, $seq_len) eq $sequence;

            $ipa .= chr($charmap_ipa{$sequence});
            substr($cxs, 0, $seq_len, '');

            # Restart from the top of the list for the next position.
            # Carrying on down the list would only try sequences no longer
            # than this one, so a longer match at the new position could be
            # missed (e.g. "O" being taken where "O\" was meant).
            next POSITION;
        }

        # Nothing matched here: pass one character through verbatim.
        # substr is used rather than a /^(.)/ match because "." does not
        # match a newline, which would leave the input unconsumed and the
        # loop spinning forever.
        $ipa .= substr($cxs, 0, 1, '');
    }
    return $ipa;
}
1;
Reply