2017-11-16 17 views
1

Unicodeエスケープ(\uXXXX)をデコードするために、次のコードを試しています。以下は、特殊文字を含む文字列のデコード(Unicodeデコード)に使っているコードです。

 -- Decodes a string made of \uXXXX escapes into the characters they name.
 --
 -- s: input text. Returned unchanged when it contains no literal `\u`.
 --
 -- The guard uses STRPOS so that it looks for the literal two-character
 -- sequence `\u`; in a LIKE pattern the backslash acts as the escape character.
 -- Fragments are aggregated in their original split order (WITH OFFSET +
 -- ORDER BY) because ARRAY_AGG does not guarantee element order otherwise.
 --
 -- NOTE(review): every non-empty fragment between `\u` markers is cast as hex,
 -- so text that mixes escapes with other characters (for example a `$-`
 -- prefix) is still expected to fail the INT64 cast -- confirm before reuse.
 CREATE TEMP FUNCTION DecodeUnicode(s STRING) AS (
     IF(STRPOS(s, '\\u') = 0, s,
     (SELECT CODE_POINTS_TO_STRING(ARRAY_AGG(CAST(CONCAT('0x', x) AS INT64) ORDER BY pos))
     FROM UNNEST(SPLIT(s, '\\u')) AS x WITH OFFSET AS pos
     WHERE x != ''))
    );

     -- Run DecodeUnicode over two sample strings: one with \uXXXX escapes
     -- behind a literal `$-` prefix, and one with no escapes at all.
     WITH samples AS (
         SELECT TRIM(r'$-\u6599\u91d1\u304c\u9ad8\u3059\u304e\uff01\uff01\uff01') AS original
         UNION ALL
         SELECT TRIM(r'abcd')
     )
     SELECT
         samples.original,
         DecodeUnicode(samples.original) AS decoded
     FROM samples;

答えて

1

以下はBigQueryの標準SQL向けのコードです。
#standardSQL 
-- Decodes a string consisting of \uXXXX escapes into the characters they name.
--
-- s: text made up of `\u` markers each followed by hex digits, e.g. `\u6599`.
--
-- The input is split on the literal `\u`, empty fragments are dropped, and
-- each remaining fragment is cast from hex ('0x' + fragment) to a code point.
-- Fragments are aggregated by their split offset because ARRAY_AGG does not
-- guarantee element order without an explicit ORDER BY.
CREATE TEMP FUNCTION DecodeUnicode(s STRING) AS (
    (
        SELECT
            CODE_POINTS_TO_STRING(
                ARRAY_AGG(CAST(CONCAT('0x', x) AS INT64) ORDER BY pos)
            )
        FROM UNNEST(SPLIT(s, '\\u')) AS x WITH OFFSET AS pos
        WHERE x != ''
    )
);
-- Decode every \uXXXX escape found in `original` and leave all other
-- characters untouched. Returns one row per distinct input string.
WITH `yourTable` AS (
    SELECT r'$-\u6599\u91d1\u304c\u9ad8\u3059\u304e\uff01\uff01\uff01' AS original UNION ALL
    SELECT r'abcd'
),
-- Collapse duplicate inputs: the final GROUP BY keys on `original`, so two
-- identical rows would otherwise have their tokens aggregated together.
inputs AS (
    SELECT DISTINCT original
    FROM `yourTable`
),
-- One row per token, in reading order. A token is either a complete \uXXXX
-- escape (6 characters, hex digits in either case) or exactly one character.
-- The (?s) flag lets `.` match line breaks so no character is dropped.
tokens AS (
    SELECT
        i.original,
        pos,
        token
    FROM inputs AS i
    CROSS JOIN UNNEST(
        REGEXP_EXTRACT_ALL(i.original, r'(?s)\\u[0-9a-fA-F]{4}|.')
    ) AS token WITH OFFSET AS pos
)
SELECT
    original,
    STRING_AGG(
        -- Only escape tokens are 6 characters long; everything else is a
        -- single character and is copied through as-is. LOWER normalizes the
        -- hex digits so decoding does not depend on how the hex cast treats
        -- upper-case digits (the `\u` marker itself is already lower-case).
        IF(LENGTH(token) = 6, DecodeUnicode(LOWER(token)), token),
        '' ORDER BY pos
    ) AS decoded
FROM tokens
GROUP BY original
ORDER BY original;

このクエリは、すべてのUnicodeエスケープを抽出してデコードし、元の文字列中の該当箇所をデコード後の文字に置き換えます。Unicodeエスケープ以外の文字はそのまま残ります。出力は以下のようになります。

original             decoded 
$-\u6599\u91d1\u304c\u9ad8\u3059\u304e\uff01\uff01\uff01  $-料金が高すぎ！！！  
abcd              abcd  
+0

また、 –