Author: Bil Corry
License: Public Domain
Posted: 17 Aug 2007
Updated: 17 Aug 2007
Charset-aware include_url, returns string instead of bytes.  Use it the same as you would [include_url], with all the same parameters.  It will sniff the headers and meta on the page to try to discover the charset, then translate it to the outbound charset.  If no charset is discovered, it will default to UTF-8.

Optionally, you can specify the charset of the page you're trying to include.

This example will serve a UTF-8 HTML document, but includes a SHIFT_JIS HTML document as its source.

content_type:'text/html; charset=utf-8';

    // NOTE(review): this listing looks incomplete -- the define_tag header that
    // this -description parameter belongs to is not shown, and no closing /if;
    // or /define_tag; lines appear anywhere below. Confirm against the original
    // tag source before using this code.
    -description='Charset-aware include_url, returns string instead of bytes.',

    // Work on a local copy of the parameters this tag was called with; the same
    // set is handed to include_url further down.
    local:'lp_params' = params;
    // Name of the variable the response headers are read from later
    // (see the (var: #lp_headers) lookup below). Defaults to a private name.
    local:'lp_headers' = '_lp_include_url_headers';
    // If the caller supplied -RetrieveMIMEHeaders, use the variable name they gave.
    if: #lp_params->(find:'-RetrieveMIMEHeaders')->size;
        #lp_headers = #lp_params->(find:'-RetrieveMIMEHeaders')->(get:1)->value;
    // NOTE(review): nothing visible here adds -RetrieveMIMEHeaders to lp_params
    // when the caller omitted it, so the default header variable may never be
    // populated -- presumably an else branch was lost from this listing; verify.
	// Charset of the remote page; stays empty unless given or detected.
	local:'lp_charset' = string;
	// An explicit -charset=... (passed as a pair) takes precedence over sniffing.
	if: params->(find:'-charset')->size && params->(find:'-charset')->(get:1)->type == 'pair';
		local:'lp_charset' = #lp_params->(find:'-charset')->(get:1)->value;

    // Invoke the built-in include_url with the collected parameters. The result
    // is used as bytes below (getrange / exportstring).
    local:'lp_bytes' = @\include_url->(run: -name='include_url', -owner='include_url', -params=#lp_params);

	// No charset supplied by the caller: try to discover one.
	if: #lp_charset->size == 0;
	    // First look for "charset=<name>" in the variable named by lp_headers.
	    #lp_charset = (string_findregexp: (var: #lp_headers), -find='(?i)charset\\s*=\\s*([\\w\\-]+)');
	    // A two-element result is treated as a successful match; element 2 is
	    // taken as the charset name (presumably the captured group -- confirm).
	    if: #lp_charset->size == 2;
	        #lp_charset = #lp_charset->(get:2);
	    else;  // charset not found in headers, try meta on page
	    	local:'lp_page_top' = (string: #lp_bytes->(getrange: 1, 1024)); // only look in first 1k of page
		    // Same pattern as above, applied to the start of the page body.
		    #lp_charset = (string_findregexp: #lp_page_top, -find='(?i)charset\\s*=\\s*([\\w\\-]+)');
		    if: #lp_charset->size == 2;
		        #lp_charset = #lp_charset->(get:2);
		        // NOTE(review): as listed, the next line runs right after the
		        // assignment above and overwrites the detected charset; an
		        // else; before it appears to be missing -- confirm.
		        #lp_charset = 'utf-8';  // default is utf-8 if all else fails

    // UTF-8 content is converted with a plain string cast; anything else is
    // converted from the named charset via exportstring.
    if: #lp_charset == 'utf-8';
        return: (string: #lp_bytes);
        // NOTE(review): as listed, this second return directly follows the
        // first and cannot be reached; an else; before it appears to be
        // missing -- confirm.
        return: #lp_bytes->(exportstring:#lp_charset);



