代码1:
/// <summary>
/// Reads the whole body of <paramref name="w"/> and decodes it to text. The encoding is
/// taken from the charset parameter of the Content-Type header or, when the header has
/// none, from the first "charset=" declaration found in the body (for example an HTML
/// meta tag).
/// </summary>
/// <param name="w">Response to read; its response stream is read to the end and closed.</param>
/// <returns>The decoded body, converted to lower case with <c>ToLower()</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="w"/> is null.</exception>
/// <remarks>
/// ASCII is used when no charset is declared or when the declared name is not a known
/// encoding. Diagnostic lines are written to the console when a charset is found and when
/// the encoding lookup fails.
/// </remarks>
public static String DecodeData(WebResponse w)
{
    if (w == null)
    {
        throw new ArgumentNullException("w");
    }

    // First see if the Content-Type header carries a charset parameter.
    String charset = ExtractCharsetValue(w.Headers["content-type"]);
    if (charset != null)
    {
        Console.WriteLine("CT: charset=" + charset);
    }

    // Buffer the raw bytes: they may have to be decoded twice (once as ASCII to look for
    // a declaration, once with the real encoding). The using blocks close the response
    // stream even when a read throws.
    byte[] rawdata;
    using (Stream rs = w.GetResponseStream())
    using (MemoryStream collected = new MemoryStream())
    {
        byte[] buffer = new byte[1024];
        int read;
        while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
        {
            collected.Write(buffer, 0, read);
        }
        rawdata = collected.ToArray();
    }

    // If the header was missing or did not contain a charset, search the body.
    if (charset == null)
    {
        String meta;
        using (StreamReader srr = new StreamReader(new MemoryStream(rawdata), Encoding.ASCII))
        {
            meta = srr.ReadToEnd();
        }

        charset = ExtractCharsetValue(meta);
        if (charset != null)
        {
            Console.WriteLine("META: charset=" + charset);
        }
    }

    Encoding e = Encoding.ASCII; // default encoding
    if (charset != null)
    {
        try
        {
            e = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException ee)
        {
            // Unknown or unsupported charset name: report it and keep the ASCII default.
            Console.WriteLine("Exception: GetEncoding: " + charset);
            Console.WriteLine(ee.ToString());
        }
    }

    using (StreamReader sr = new StreamReader(new MemoryStream(rawdata), e))
    {
        return sr.ReadToEnd().ToLower();
    }
}

/// <summary>
/// Returns the value that follows the first "charset=" in <paramref name="text"/>, or null
/// when there is no such declaration or its value is empty.
/// </summary>
private static String ExtractCharsetValue(String text)
{
    const String marker = "charset=";

    if (text == null)
    {
        return null;
    }

    int ind = text.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
    if (ind == -1)
    {
        return null;
    }

    // Skip whitespace and an opening quote so that both charset=utf-8 and
    // charset="utf-8" are handled.
    int start = ind + marker.Length;
    while (start < text.Length
        && (Char.IsWhiteSpace(text[start]) || text[start] == '"' || text[start] == '\''))
    {
        start++;
    }

    // The name ends at whitespace, a quote, a parameter separator or the end of the tag.
    int end = start;
    while (end < text.Length
        && !Char.IsWhiteSpace(text[end])
        && "\"';,>/".IndexOf(text[end]) == -1)
    {
        end++;
    }

    return end > start ? text.Substring(start, end - start) : null;
}
代码2:
/// <summary>
/// Downloads <paramref name="url"/> and returns the response body decoded to text.
/// </summary>
/// <param name="url">Address of an HTTP(S) resource.</param>
/// <returns>
/// The decoded body, or null when the body could not be read or the declared character
/// set is not a known encoding.
/// </returns>
/// <remarks>
/// The character set is taken from the Content-Type header, then from the first charset
/// declaration inside the body; UTF-8 is used when neither declares one.
/// </remarks>
public static string DownloadPage(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // Content-Encoding names a compression scheme (gzip, deflate), not a character set,
    // so it must not be passed to Encoding.GetEncoding. Let the framework undo it instead.
    req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        byte[] buffer;
        using (Stream s = resp.GetResponseStream())
        {
            buffer = ReadStream(s);
        }

        if (buffer == null)
        {
            return null;
        }

        // The header is parsed here rather than read from resp.CharacterSet: that property
        // reports a framework default (ISO-8859-1) for text responses without an explicit
        // charset, which would hide a charset declared in the page itself.
        string pageEncoding = GetCharacterSet(resp.ContentType ?? "");
        if (pageEncoding == "")
        {
            pageEncoding = GetCharacterSet(buffer);
        }

        Encoding e = Encoding.UTF8;
        if (pageEncoding != "")
        {
            try
            {
                e = Encoding.GetEncoding(pageEncoding);
            }
            catch (ArgumentException)
            {
                // Declared charset is not a known encoding name.
                return null;
            }
        }

        return e.GetString(buffer);
    }
}

/// <summary>
/// Extracts the value of the first "charset=" declaration in <paramref name="s"/>.
/// </summary>
/// <param name="s">A Content-Type header value or a chunk of markup.</param>
/// <returns>The charset name in upper case, or "" when none is declared.</returns>
private static string GetCharacterSet(string s)
{
    const string marker = "CHARSET";

    if (string.IsNullOrEmpty(s))
    {
        return "";
    }

    int pos = 0;
    while ((pos = s.IndexOf(marker, pos, StringComparison.OrdinalIgnoreCase)) != -1)
    {
        pos += marker.Length;

        // Only an occurrence that is directly followed by '=' is a declaration.
        int i = pos;
        while (i < s.Length && char.IsWhiteSpace(s[i]))
        {
            i++;
        }
        if (i >= s.Length || s[i] != '=')
        {
            continue;
        }
        i++;

        // Skip whitespace and an opening quote: charset="utf-8" and charset=utf-8 are both valid.
        while (i < s.Length && (char.IsWhiteSpace(s[i]) || s[i] == '"' || s[i] == '\''))
        {
            i++;
        }

        // The name ends at whitespace, a quote, a parameter separator or the end of the tag.
        int end = i;
        while (end < s.Length && !char.IsWhiteSpace(s[end]) && ";\"'/>".IndexOf(s[end]) == -1)
        {
            end++;
        }

        if (end > i)
        {
            // Invariant casing keeps names such as "iso-8859-1" intact under any culture.
            return s.Substring(i, end - i).ToUpperInvariant();
        }
    }

    return "";
}

/// <summary>
/// Looks for a charset declaration inside raw page bytes.
/// </summary>
/// <param name="data">Undecoded response body; may be null.</param>
/// <returns>The charset name in upper case, or "" when none is declared.</returns>
private static string GetCharacterSet(byte[] data)
{
    if (data == null)
    {
        return "";
    }

    // The declaration itself is plain ASCII, so an ASCII view of the bytes is enough to
    // find it and does not depend on the machine's default code page.
    return GetCharacterSet(Encoding.ASCII.GetString(data));
}

/// <summary>
/// Reads <paramref name="s"/> to the end.
/// </summary>
/// <param name="s">Stream to drain; may be null.</param>
/// <returns>All bytes read, or null when the stream is null or reading fails.</returns>
private static byte[] ReadStream(Stream s)
{
    if (s == null)
    {
        return null;
    }

    try
    {
        byte[] buffer = new byte[8096];
        using (MemoryStream ms = new MemoryStream())
        {
            int read;
            while ((read = s.Read(buffer, 0, buffer.Length)) > 0)
            {
                ms.Write(buffer, 0, read);
            }
            return ms.ToArray();
        }
    }
    catch (IOException)
    {
        // Transport failure while reading the body.
        return null;
    }
    catch (WebException)
    {
        // Request aborted or timed out while reading the body.
        return null;
    }
}
以上两段代码都来自国外论坛。如果响应头和页面内容都没有声明字符集,也许可以通过分析读取到的字节流本身的特征来推断匹配的编码,不过这样做就相当麻烦了。
原创文章,作者:苏葳,如需转载,请注明出处:https://www.swmemo.com/588.html