diff --git a/include/functions.inc.php b/include/functions.inc.php index 12be821e7..4e5c848e9 100644 --- a/include/functions.inc.php +++ b/include/functions.inc.php @@ -203,21 +203,30 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT) /* Returns true if the string appears to be encoded in UTF-8. (from wordpress) * @param string Str */ -function seems_utf8($Str) { # by bmorel at ssi dot fr +function seems_utf8($Str) { + // OBSOLETE !!! + return qualify_utf8($Str) >= 0; +} + +/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */ +function qualify_utf8($Str) +{ + $ret = 0; for ($i=0; $i 0 ) { $chars = array( // Decompositions for Latin-1 Supplement "\xc3\x80"=>'A', "\xc3\x81"=>'A', @@ -323,6 +333,9 @@ function remove_accents($string) "\xc5\xba"=>'z', "\xc5\xbb"=>'Z', "\xc5\xbc"=>'z', "\xc5\xbd"=>'Z', "\xc5\xbe"=>'z', "\xc5\xbf"=>'s', + // Decompositions for Latin Extended-B + "\xc8\x98"=>'S', "\xc8\x99"=>'s', + "\xc8\x9a"=>'T', "\xc8\x9b"=>'t', // Euro Sign "\xe2\x82\xac"=>'E', // GBP (Pound) Sign @@ -353,6 +366,23 @@ function remove_accents($string) return $string; } +if (function_exists('mb_strtolower') && defined('PWG_CHARSET')) +{ + function transliterate($term) + { + return remove_accents( mb_strtolower($term, PWG_CHARSET) ); + } +} +else +{ + function transliterate($term) + { + return remove_accents( strtolower($term) ); + } +} + + + /** * simplify a string to insert it into an URL * @@ -361,16 +391,14 @@ function remove_accents($string) */ function str2url($str) { - $raw = $str; - - $str = remove_accents($str); - $str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str)); + $str = $safe = transliterate($str); + $str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str); $str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str)); $res = str_replace(' ','_',$str); if (empty($res)) { - $res = str_replace(' ','_', $raw); + $res = str_replace(' ','_', $safe); } return $res; diff --git a/include/functions_html.inc.php b/include/functions_html.inc.php index 7808045d9..8450b4c82 100644 --- a/include/functions_html.inc.php +++ b/include/functions_html.inc.php @@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b) { if (!isset($cache[__FUNCTION__][ $tag['name'] ])) { - $cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name'])); + $cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']); } } diff --git a/include/functions_metadata.inc.php b/include/functions_metadata.inc.php index e8935aefd..5a8671d77 100644 --- a/include/functions_metadata.inc.php +++ b/include/functions_metadata.inc.php @@ -90,10 +90,12 @@ function clean_iptc_value($value) // apparently mac uses some MacRoman crap encoding. I don't know // how to detect it so a plugin should do the trick. $value = trigger_event('clean_iptc_value', $value); - $is_utf8 = seems_utf8($value); - $value = convert_charset( $value, - $is_utf8 ? 'utf-8' : 'iso-8859-1', - get_pwg_charset() ); + if ( ($qual = qualify_utf8($value)) != 0) + {// has non ascii chars + $value = convert_charset( $value, + $qual>0 ? 'utf-8' : 'iso-8859-1', + get_pwg_charset() ); + } } return $value; } diff --git a/include/functions_search.inc.php b/include/functions_search.inc.php index f25cd4670..db54dc767 100644 --- a/include/functions_search.inc.php +++ b/include/functions_search.inc.php @@ -266,21 +266,6 @@ SELECT DISTINCT(id) } -if (function_exists('mb_strtolower')) -{ - function transliterate($term) - { - return remove_accents( mb_strtolower($term) ); - } -} -else -{ - function transliterate($term) - { - return remove_accents( strtolower($term) ); - } -} - function is_word_char($ch) { return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127; diff --git a/tags.php b/tags.php index da61d6cd3..b19bce123 100644 --- a/tags.php +++ b/tags.php @@ -99,7 +99,7 @@ if ($page['display_mode'] == 'letters') { foreach ($tags as $tag) { - $tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8')); + $tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET); if ($current_tag_idx==0) { $current_letter = $tag_letter;