<?php namespace Spondonit\Arabic\Arabic;
/**
* ----------------------------------------------------------------------
*
* Copyright (c) 2006-2013 Khaled Al-Sham'aa.
*
* http://www.ar-php.org
*
* PHP Version 5
*
* ----------------------------------------------------------------------
*
* LICENSE
*
* This program is open source product; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License (LGPL)
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/lgpl.txt>.
*
* ----------------------------------------------------------------------
*
* Class Name: English-Arabic Transliteration
*
* Filename: Transliteration.php
*
* Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
*
* Purpose: Transliterate English words into Arabic by render them
* in the orthography of the Arabic language and vise versa
*
* ----------------------------------------------------------------------
*
* English-Arabic Transliteration
*
* PHP class transliterate English words into Arabic by render them in the
* orthography of the Arabic language and vise versa.
*
* Out of vocabulary (OOV) words are a common source of errors in cross language
* information retrieval. Bilingual dictionaries are often limited in their coverage
* of named- entities, numbers, technical terms and acronyms. There is a need to
* generate translations for these "on-the-fly" or at query time.
*
* A significant proportion of OOV words are named entities and technical terms.
* Typical analyses find around 50% of OOV words to be named entities. Yet these
* can be the most important words in the queries. Cross language retrieval
* performance (average precision) reduced more than 50% when named entities in the
* queries were not translated.
*
* When the query language and the document language share the same alphabet it may
* be sufficient to use the OOV word as its own translation. However, when the two
* languages have different alphabets, the query term must somehow be rendered in
* the orthography of the other language. The process of converting a word from one
* orthography into another is called transliteration.
*
* Foreign words often occur in Arabic text as transliteration. This is the case for
* many categories of foreign words, not just proper names but also technical terms
* such as caviar, telephone and internet.
*
* Example:
* <code>
* include('./I18N/Arabic.php');
* $obj = new I18N_Arabic('Transliteration');
*
* $ar_word_1 = $obj->en2ar($en_word_1);
* $en_word_2 = $obj->ar2en($ar_word_2);
* </code>
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2013 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
// New in PHP V5.3: Namespaces
// namespace I18N\Arabic;
//
// $obj = new I18N\Arabic\Transliteration();
//
// use I18N\Arabic;
// $obj = new Arabic\Transliteration();
//
// use I18N\Arabic\Transliteration as Transliteration;
// $obj = new Transliteration();
/**
* This PHP class transliterate English words into Arabic
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2013 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
class I18N_Arabic_Transliteration
{
private static $_arFinePatterns = array("/'+/u", "/([\- ])'/u", '/(.)#/u');
private static $_arFineReplacements = array("'", '\\1', "\\1'\\1");
private static $_en2arPregSearch = array();
private static $_en2arPregReplace = array();
private static $_en2arStrSearch = array();
private static $_en2arStrReplace = array();
private static $_ar2enPregSearch = array();
private static $_ar2enPregReplace = array();
private static $_ar2enStrSearch = array();
private static $_ar2enStrReplace = array();
private static $_diariticalSearch = array();
private static $_diariticalReplace = array();
private static $_iso233Search = array();
private static $_iso233Replace = array();
private static $_rjgcSearch = array();
private static $_rjgcReplace = array();
private static $_sesSearch = array();
private static $_sesReplace = array();
/**
* Loads initialize values
*
* @ignore
*/
public function __construct()
{
$xml = simplexml_load_file(dirname(__FILE__).'/data/Transliteration.xml');
foreach ($xml->xpath("//preg_replace[@function='ar2en']/pair") as $pair) {
array_push(self::$_ar2enPregSearch, (string)$pair->search);
array_push(self::$_ar2enPregReplace, (string)$pair->replace);
}
foreach (
$xml->xpath("//str_replace[@function='diaritical']/pair") as $pair
) {
array_push(self::$_diariticalSearch, (string)$pair->search);
array_push(self::$_diariticalReplace, (string)$pair->replace);
}
foreach ($xml->xpath("//str_replace[@function='ISO233']/pair") as $pair) {
array_push(self::$_iso233Search, (string)$pair->search);
array_push(self::$_iso233Replace, (string)$pair->replace);
}
foreach ($xml->xpath("//str_replace[@function='RJGC']/pair") as $pair) {
array_push(self::$_rjgcSearch, (string)$pair->search);
array_push(self::$_rjgcReplace, (string)$pair->replace);
}
foreach ($xml->xpath("//str_replace[@function='SES']/pair") as $pair) {
array_push(self::$_sesSearch, (string)$pair->search);
array_push(self::$_sesReplace, (string)$pair->replace);
}
foreach ($xml->xpath("//str_replace[@function='ar2en']/pair") as $pair) {
array_push(self::$_ar2enStrSearch, (string)$pair->search);
array_push(self::$_ar2enStrReplace, (string)$pair->replace);
}
foreach ($xml->xpath("//preg_replace[@function='en2ar']/pair") as $pair) {
array_push(self::$_en2arPregSearch, (string)$pair->search);
array_push(self::$_en2arPregReplace, (string)$pair->replace);
}
foreach ($xml->xpath("//str_replace[@function='en2ar']/pair") as $pair) {
array_push(self::$_en2arStrSearch, (string)$pair->search);
array_push(self::$_en2arStrReplace, (string)$pair->replace);
}
}
/**
* Transliterate English string into Arabic by render them in the
* orthography of the Arabic language
*
* @param string $string English string you want to transliterate
*
* @return String Out of vocabulary English string in Arabic characters
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
*/
public static function en2ar($string)
{
$string = strtolower($string);
$words = explode(' ', $string);
$string = '';
foreach ($words as $word) {
$word = preg_replace(
self::$_en2arPregSearch,
self::$_en2arPregReplace, $word
);
$word = str_replace(
self::$_en2arStrSearch,
self::$_en2arStrReplace,
$word
);
$string .= ' ' . $word;
}
return $string;
}
/**
* Transliterate Arabic string into English by render them in the
* orthography of the English language
*
* @param string $string Arabic string you want to transliterate
* @param string $standard Transliteration standard, default is UNGEGN
* and possible values are [UNGEGN, UNGEGN+, RJGC,
* SES, ISO233]
*
* @return String Out of vocabulary Arabic string in English characters
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
*/
public static function ar2en($string, $standard='UNGEGN')
{
//$string = str_replace('ة ال', 'tul', $string);
$words = explode(' ', $string);
$string = '';
for ($i=0; $i<count($words)-1; $i++) {
$words[$i] = str_replace('ة', 'ت', $words[$i]);
}
foreach ($words as $word) {
$temp = $word;
if ($standard == 'UNGEGN+') {
$temp = str_replace(
self::$_diariticalSearch,
self::$_diariticalReplace,
$temp
);
} else if ($standard == 'RJGC') {
$temp = str_replace(
self::$_diariticalSearch,
self::$_diariticalReplace,
$temp
);
$temp = str_replace(
self::$_rjgcSearch,
self::$_rjgcReplace,
$temp
);
} else if ($standard == 'SES') {
$temp = str_replace(
self::$_diariticalSearch,
self::$_diariticalReplace,
$temp
);
$temp = str_replace(
self::$_sesSearch,
self::$_sesReplace,
$temp
);
} else if ($standard == 'ISO233') {
$temp = str_replace(
self::$_iso233Search,
self::$_iso233Replace,
$temp
);
}
$temp = preg_replace(
self::$_ar2enPregSearch,
self::$_ar2enPregReplace,
$temp
);
$temp = str_replace(
self::$_ar2enStrSearch,
self::$_ar2enStrReplace,
$temp
);
$temp = preg_replace(
self::$_arFinePatterns,
self::$_arFineReplacements,
$temp
);
if (preg_match('/[a-z]/', mb_substr($temp, 0, 1))) {
$temp = ucwords($temp);
}
$pos = strpos($temp, '-');
if ($pos > 0) {
if (preg_match('/[a-z]/', mb_substr($temp, $pos+1, 1))) {
$temp2 = substr($temp, 0, $pos);
$temp2 .= '-'.strtoupper($temp[$pos+1]);
$temp2 .= substr($temp, $pos+2);
} else {
$temp2 = $temp;
}
} else {
$temp2 = $temp;
}
$string .= ' ' . $temp2;
}
return $string;
}
/**
* Render numbers in given string using HTML entities that will show them as
* Arabic digits (i.e. 1, 2, 3, etc.) whatever browser language settings are
* (if browser supports UTF-8 character set).
*
* @param string $string String includes some digits here or there
*
* @return String Original string after replace digits by HTML entities that
* will show given number using Indian digits
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
*/
public static function enNum($string)
{
$html = '';
$digits = str_split("$string");
foreach ($digits as $digit) {
$html .= preg_match('/\d/', $digit) ? "$digit;" : $digit;
}
return $html;
}
/**
* Render numbers in given string using HTML entities that will show them as
* Indian digits (i.e. ١, ٢, ٣, etc.) whatever browser language settings are
* (if browser supports UTF-8 character set).
*
* @param string $string String includes some digits here or there
*
* @return String Original string after replace digits by HTML entities that
* will show given number using Arabic digits
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
*/
public static function arNum($string)
{
$html = '';
$digits = str_split("$string");
foreach ($digits as $digit) {
$html .= preg_match('/\d/', $digit) ? "f$digit;" : $digit;
}
return $html;
}
}