Theiling Online    Sitemap    Conlang Mailing List HQ   

Re: OT: CXS chart and machine-readable Unicode->CXS mappings

From:Mark J. Reed <markjreed@...>
Date:Tuesday, March 9, 2004, 16:41
On Mon, Mar 08, 2004 at 11:13:42PM -0500, Mark J. Reed wrote:
> On Mon, Mar 08, 2004 at 09:18:25PM -0600, Herman Miller wrote:
> > >I composed the Conlang X-Sampa (CXS) chart in a form that is readable
> > >from Lisp, Perl and C/C++.
I've wrapped a module around the Perl so that you can simply do

    use CXS;

And then, if you have a UTF-8-encoded IPA transcription in $ipa, you can
get the CXS equivalent via

    my $cxs = ipa2cxs($ipa);

or vice-versa:

    my $ipa = cxs2ipa($cxs);

The functions are naïve and won't do any reordering or anything like that
(such as turning ab) into a b with a tie over them), but it works fairly
well even so.  At least cxs2ipa is smart enough to use the longest matching
sequence.

I tried attaching the .pm, but the listserv rejected it, so I'm just going
to include the text here.

-Mark

8<-------------------------------CUT HERE------------------------------------
package CXS;

# CXS - convert between IPA (as Unicode text) and CXS (Conlang X-SAMPA).
#
# Exports two functions:
#   ipa2cxs($ipa)  - IPA text  -> CXS string
#   cxs2ipa($cxs)  - CXS string -> IPA text (a Perl character string)
#
# Both conversions are purely table-driven, one symbol at a time; nothing is
# reordered and characters/sequences without a table entry pass through
# unchanged.

use strict;
use warnings;

use base 'Exporter';
our @EXPORT = qw(ipa2cxs cxs2ipa);

# Unicode code point => CXS sequence.  Several code points share the same CXS
# sequence (e.g. both 0x0180 and 0x03B2 map to "B"), so this direction is
# many-to-one.
my %charmap_cxs = (
    0x00E6 => "&",       0x00E7 => "C",       0x00F0 => "D",       0x00F8 => "2",
    0x0127 => "X\\",     0x014B => "N",       0x0153 => "9",       0x0180 => "B",
    0x01A5 => "p_<",     0x01AB => "t_j",     0x01AD => "t_<",     0x01BB => "dz)",
    0x01C0 => "|\\",     0x01C1 => "|\\|\\",  0x01C2 => "=\\",     0x01C3 => "!\\",
    0x0250 => "6",       0x0251 => "A",       0x0252 => "Q",       0x0253 => "b_<",
    0x0254 => "O",       0x0255 => "s\\",     0x0256 => "d`",      0x0257 => "d_<",
    0x0258 => "@\\",     0x0259 => "@",       0x025A => "@`",      0x025B => "E",
    0x025C => "3",       0x025D => "3`",      0x025E => "3\\",     0x025F => "J\\",
    0x0260 => "g_<",     0x0261 => "g",       0x0262 => "G\\",     0x0263 => "G",
    0x0264 => "7",       0x0265 => "H",       0x0266 => "h\\",     0x0267 => "x\\",
    0x0268 => "1",       0x0269 => "I",       0x026A => "I",       0x026B => "5",
    0x026C => "K",       0x026D => "l`",      0x026E => "K\\",     0x026F => "M",
    0x0270 => "M\\",     0x0271 => "F",       0x0272 => "J",       0x0273 => "n`",
    0x0274 => "N\\",     0x0275 => "8",       0x0276 => "&\\",     0x0277 => "U",
    0x0278 => "p\\",     0x0279 => "r\\",     0x027A => "l\\",     0x027B => "r\\`",
    0x027C => "r\\_r",   0x027D => "r`",      0x027E => "4",       0x027F => "z=",
    0x0280 => "R\\",     0x0281 => "R",       0x0282 => "s`",      0x0283 => "S",
    0x0284 => "J\\_<",   0x0285 => "z`=",     0x0286 => "S_j",     0x0287 => "|\\",
    0x0288 => "t`",      0x0289 => "u\\",     0x028A => "U",       0x028B => "P",
    0x028C => "V",       0x028D => "W",       0x028E => "L",       0x028F => "Y",
    0x0290 => "z`",      0x0291 => "z\\",     0x0292 => "Z",       0x0293 => "Z_j",
    0x0294 => "?",       0x0295 => "?\\",     0x0296 => "|\\|\\",  0x0297 => "!\\",
    0x0298 => "O\\",     0x0299 => "B\\",     0x029A => "&\\",     0x029B => "G\\_<",
    0x029C => "H\\",     0x029D => "j\\",     0x029F => "L\\",     0x02A0 => "q_<",
    0x02A1 => ">\\",     0x02A2 => "<\\",     0x02A3 => "dz)",     0x02A4 => "dZ)",
    0x02A5 => "dz\\)",   0x02A6 => "ts)",     0x02A7 => "tS)",     0x02A8 => "ts\\)",
    0x02A9 => "fN)",     0x02AA => "ls)",     0x02AB => "lz)",     0x02AC => "._w_w",
    0x02AD => "._d_d",   0x02B0 => "_h",      0x02B1 => "_t",      0x02B2 => "_j",
    0x02B7 => "_w",      0x02B8 => "_j",      0x02BC => "_>",      0x02C0 => "_>",
    0x02C7 => "_F_R",    0x02C8 => "'",       0x02C9 => "_T",      0x02CC => "\"",
    0x02CD => "_L",      0x02CE => "_L_B",    0x02CF => "_B_L",    0x02D0 => ":",
    0x02D1 => ":\\",     0x02D2 => "_O",      0x02D3 => "_c",      0x02D4 => "_r",
    0x02D5 => "_o",      0x02D6 => "_+",      0x02D7 => "_-",      0x02D8 => "_X",
    0x02DA => "_0",      0x02DC => "~",       0x02DD => "_T",      0x02DE => "`",
    0x02E0 => "_G",      0x02E1 => "_l",      0x02E4 => "_?\\",    0x02E5 => "_T",
    0x02E6 => "_H",      0x02E7 => "_M",      0x02E8 => "_L",      0x02E9 => "_B",
    0x02EC => "_v",      0x0300 => "_L",      0x0301 => "_H",      0x0302 => "_F",
    0x0303 => "~",       0x0304 => "_M",      0x0306 => "_X",      0x0308 => "_\"",
    0x030A => "_0",      0x030B => "_T",      0x030C => "_R",      0x030D => "=",
    0x030F => "_B",      0x0318 => "_A",      0x0319 => "_q",      0x031A => "_}",
    0x031C => "_c",      0x031D => "_r",      0x031E => "_o",      0x031F => "_+",
    0x0320 => "_-",      0x0321 => "_j",      0x0322 => "`",       0x0324 => "_t",
    0x0325 => "_0",      0x0329 => "=",       0x032A => "_d",      0x032B => "_w",
    0x032C => "_v",      0x032F => "_^",      0x0330 => "_k",      0x0334 => "_e",
    0x0339 => "_O",      0x033A => "_a",      0x033B => "_m",      0x033C => "_N",
    0x033D => "_x",      0x0361 => "_",       0x03B2 => "B",       0x03B8 => "T",
    0x03C7 => "X",       0x2016 => "||",      0x203F => "-\\",     0x207F => "_n",
    0x2191 => "^",       0x2193 => "!",       0x2197 => "<R>",     0x2198 => "<F>",
);

# CXS sequence => Unicode code point.
#
# A plain "reverse %charmap_cxs" would let hash iteration order decide which
# code point survives for a CXS sequence that has several sources, so the
# result could change from run to run.  Walking the code points in ascending
# order makes the outcome reproducible: the highest code point wins.
# NOTE(review): that tie-break is a stable default, not a linguistic
# judgement - confirm it selects the intended IPA letter for each ambiguous
# sequence, or add explicit preferences here.
my %charmap_ipa;
for my $codepoint (sort { $a <=> $b } keys %charmap_cxs) {
    $charmap_ipa{ $charmap_cxs{$codepoint} } = $codepoint;
}

# All known CXS sequences, longest first (ties broken alphabetically so the
# order is stable).
my @cxs_sequences =
    sort { length($b) <=> length($a) || $a cmp $b } keys %charmap_ipa;

# One alternation over every CXS sequence.  Perl tries alternatives left to
# right, so listing the longest first gives longest-match-at-each-position.
my $cxs_token_re = do {
    my $alternation = join '|', map { quotemeta } @cxs_sequences;
    qr/($alternation)/;
};

# ipa2cxs($ipa)
#
# Convert IPA text to CXS.
#   $ipa    - either a decoded Perl character string or raw UTF-8 octets.
#   returns - the CXS string; characters with no table entry are copied
#             through unchanged.  undef yields ''.
sub ipa2cxs {
    my $ipa = shift;
    return '' unless defined $ipa;

    # Accept undecoded UTF-8 octets as well as character strings.  The
    # decode is attempted only on strings Perl does not already hold as
    # characters, and leaves the string alone if it is not valid UTF-8.
    # (unpack 'U*' is avoided because its handling of undecoded octets
    # differs between Perl versions.)
    utf8::decode($ipa) unless utf8::is_utf8($ipa);

    my $cxs = '';
    for my $char (split //, $ipa) {
        my $codepoint = ord $char;
        # Unmapped input must be copied as the character itself, not as its
        # numeric code point.
        $cxs .= exists $charmap_cxs{$codepoint}
            ? $charmap_cxs{$codepoint}
            : $char;
    }
    return $cxs;
}

# cxs2ipa($cxs)
#
# Convert a CXS string to IPA.
#   $cxs    - the CXS string.
#   returns - a Perl character string in which every recognised CXS sequence
#             (longest match at each position) is replaced by its Unicode
#             character; everything else, including newlines, is left as is.
#             undef yields ''.
sub cxs2ipa {
    my $cxs = shift;
    return '' unless defined $cxs;

    # A single global substitution restarts the longest-first search at every
    # position and simply skips over unrecognised characters.
    (my $ipa = $cxs) =~ s/$cxs_token_re/chr($charmap_ipa{$1})/ge;
    return $ipa;
}

1;


Henrik Theiling <theiling@...>