

代码1:
/// <summary>
/// Reads the whole body of a web response and decodes it to text. The encoding
/// is taken from the charset parameter of the Content-Type header or, when the
/// header has none, from the first "charset=" declaration found in the body.
/// </summary>
/// <param name="w">The response to read. Its response stream is consumed and closed.</param>
/// <returns>The decoded body, converted to lower case.</returns>
/// <exception cref="ArgumentNullException"><paramref name="w"/> is null.</exception>
public static String DecodeData(WebResponse w)
{
    if (w == null)
    {
        throw new ArgumentNullException("w");
    }

    // First see whether the Content-Type header carries a charset=value parameter.
    String charset = ExtractCharsetName(w.Headers["content-type"]);
    if (charset != null)
    {
        Console.WriteLine("CT: charset=" + charset);
    }

    // Buffer the raw bytes: they may be scanned for a declaration and are then decoded.
    byte[] raw;
    using (MemoryStream rawdata = new MemoryStream())
    {
        // The using block closes the response stream even when a read throws.
        using (Stream rs = w.GetResponseStream())
        {
            if (rs != null)
            {
                byte[] buffer = new byte[1024];
                int read = rs.Read(buffer, 0, buffer.Length);
                while (read > 0)
                {
                    rawdata.Write(buffer, 0, read);
                    read = rs.Read(buffer, 0, buffer.Length);
                }
            }
        }
        raw = rawdata.ToArray();
    }

    // If the header gave no charset, search the body. The declaration itself is
    // ASCII, so an ASCII view of the bytes is enough to locate it.
    if (charset == null)
    {
        charset = ExtractCharsetName(Encoding.ASCII.GetString(raw));
        if (charset != null)
        {
            Console.WriteLine("META: charset=" + charset);
        }
    }

    Encoding e = Encoding.ASCII; // default when nothing usable was declared
    if (charset != null)
    {
        try
        {
            e = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException ee)
        {
            // Unknown encoding name: report it and keep the ASCII default.
            Console.WriteLine("Exception: GetEncoding: " + charset);
            Console.WriteLine(ee.ToString());
        }
        catch (NotSupportedException ee)
        {
            // Encoding not available on this platform: keep the ASCII default.
            Console.WriteLine("Exception: GetEncoding: " + charset);
            Console.WriteLine(ee.ToString());
        }
    }

    using (StreamReader sr = new StreamReader(new MemoryStream(raw), e))
    {
        // Callers receive the page lower-cased, as before.
        return sr.ReadToEnd().ToLower();
    }
}

/// <summary>
/// Returns the encoding name that follows the first "charset=" in
/// <paramref name="text"/> (matched case-insensitively), without surrounding
/// quotes or trailing parameters; returns null when there is none.
/// </summary>
private static String ExtractCharsetName(String text)
{
    if (String.IsNullOrEmpty(text))
    {
        return null;
    }

    int ind = text.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
    if (ind == -1)
    {
        return null;
    }

    // Skip an opening quote or blanks so both charset=utf-8 and charset="utf-8" work.
    int start = ind + 8;
    while (start < text.Length &&
           (text[start] == '"' || text[start] == '\'' || text[start] == ' '))
    {
        start++;
    }

    // The name ends at the first character that cannot be part of an encoding
    // name (closing quote, ';', '>', '/', whitespace, ...).
    int end = start;
    while (end < text.Length)
    {
        char c = text[end];
        bool nameChar = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
                        (c >= '0' && c <= '9') ||
                        c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
        if (!nameChar)
        {
            break;
        }
        end++;
    }

    return end > start ? text.Substring(start, end - start) : null;
}
代码2:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it to text.
/// The character set is looked up, in order, in the Content-Type header, in a
/// charset declaration inside the downloaded bytes, and in the response's
/// CharacterSet property; UTF-8 is used when none of them names one.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The decoded page, or null when the body could not be read or the declared
/// character set is not a supported encoding.
/// </returns>
public static string DownloadPage(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        byte[] buffer;
        using (Stream s = resp.GetResponseStream())
        {
            buffer = ReadStream(s);
        }
        if (buffer == null)
        {
            // ReadStream reports a failed read with null; pass the failure on
            // instead of dereferencing it below.
            return null;
        }

        // Note: resp.ContentEncoding is deliberately not consulted. It carries the
        // Content-Encoding header (a body compression scheme such as gzip), not a
        // character set, so treating it as an encoding name made such responses
        // fail with a null result.
        string pageEncoding = "";
        if (!string.IsNullOrEmpty(resp.ContentType))
        {
            pageEncoding = GetCharacterSet(resp.ContentType);
        }
        if (string.IsNullOrEmpty(pageEncoding))
        {
            pageEncoding = GetCharacterSet(buffer);
        }
        if (string.IsNullOrEmpty(pageEncoding))
        {
            // Last resort: whatever the framework derived from the headers
            // (may be null or empty).
            pageEncoding = resp.CharacterSet;
        }

        Encoding e = Encoding.UTF8;
        if (!string.IsNullOrEmpty(pageEncoding))
        {
            try
            {
                e = Encoding.GetEncoding(pageEncoding);
            }
            catch (ArgumentException)
            {
                // Not a valid encoding name.
                return null;
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform.
                return null;
            }
        }

        return e.GetString(buffer);
    }
}
/// <summary>
/// Extracts the value of the first "charset = value" declaration in
/// <paramref name="s"/>, for example from a Content-Type header value or from
/// HTML markup. Matching is case-insensitive and the value may be quoted.
/// </summary>
/// <param name="s">Text to search; may be null or empty.</param>
/// <returns>The upper-cased character set name, or "" when none is found.</returns>
private static string GetCharacterSet(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return "";
    }

    // Invariant casing: a culture-sensitive ToUpper() can alter letters of the
    // encoding name itself (under Turkish culture 'i' becomes a dotted capital I).
    s = s.ToUpperInvariant();

    int search = 0;
    while (true)
    {
        int start = s.IndexOf("CHARSET", search, StringComparison.Ordinal);
        if (start == -1)
        {
            return "";
        }

        int pos = start + 7; // length of "CHARSET"
        search = pos;

        // Only accept the keyword when '=' follows it (blanks allowed in between);
        // otherwise this is ordinary text and the search continues.
        while (pos < s.Length && char.IsWhiteSpace(s[pos]))
        {
            pos++;
        }
        if (pos >= s.Length || s[pos] != '=')
        {
            continue;
        }
        pos++;

        // Skip blanks and an opening quote so charset="utf-8" is handled.
        while (pos < s.Length &&
               (char.IsWhiteSpace(s[pos]) || s[pos] == '"' || s[pos] == '\''))
        {
            pos++;
        }

        // The name ends at the first character that cannot belong to an encoding
        // name: closing quote, ';', '/', '>', whitespace, and so on.
        int end = pos;
        while (end < s.Length)
        {
            char c = s[end];
            bool nameChar = (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
                            c == '-' || c == '_' || c == '.' || c == ':' || c == '+';
            if (!nameChar)
            {
                break;
            }
            end++;
        }

        if (end > pos)
        {
            return s.Substring(pos, end - pos);
        }
    }
}
/// <summary>
/// Looks for a charset declaration inside raw, not yet decoded page bytes.
/// </summary>
/// <param name="data">Downloaded bytes; may be null or empty.</param>
/// <returns>The declared character set name, or "" when there is none.</returns>
private static string GetCharacterSet(byte[] data)
{
    if (data == null || data.Length == 0)
    {
        // Nothing to scan, e.g. when the download failed and produced no buffer.
        return "";
    }

    // The declaration being searched for consists of ASCII characters, so an
    // ASCII view of the bytes is sufficient and gives the same result on every
    // platform, unlike the platform-dependent Encoding.Default.
    string s = Encoding.ASCII.GetString(data);
    return GetCharacterSet(s);
}
/// <summary>
/// Drains <paramref name="s"/> to its end and returns everything that was read.
/// </summary>
/// <param name="s">Stream to read from.</param>
/// <returns>
/// The bytes read, or null when any exception occurs while reading
/// (including when <paramref name="s"/> is null).
/// </returns>
private static byte[] ReadStream(Stream s)
{
    try
    {
        using (MemoryStream collected = new MemoryStream())
        {
            byte[] chunk = new byte[8096];
            int count;

            // Keep pulling chunks until the stream reports that nothing is left.
            while ((count = s.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, count);
            }

            return collected.ToArray();
        }
    }
    catch (Exception)
    {
        // Any failure is reported to the caller as a null result.
        return null;
    }
}
两段代码都来自国外论坛。也许还可以通过分析所读取页面字节流本身的特征来找出匹配的编码,不过那样就相当麻烦了。
原创文章,作者:苏葳,如需转载,请注明出处:https://www.swmemo.com/588.html
