Jeg ved godt at spørgsmålet er halvandet år gammelt, men hvis nogen finder det
ved søgning, så er her en løsning:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a document over HTTP and decodes it to a {@code String} using the character set
 * the response declares.
 *
 * <p>The charset is resolved in this order of increasing precedence: the ISO-8859-1 default,
 * the {@code charset} parameter of the HTTP {@code Content-Type} response header, and the
 * {@code charset} parameter of a {@code <meta http-equiv="Content-Type" content="...">} tag
 * found in the document body.
 */
public class HttpDownloadCharset {
    /** Charset used when neither the HTTP header nor the document names one. */
    private static final String DEFAULT_CHARSET = "ISO-8859-1";

    /** Upper bound on the buffer pre-allocated from a declared Content-Length. */
    private static final int MAX_INITIAL_CAPACITY = 1000000;

    /** Size of the scratch buffer used for each read from the response stream. */
    private static final int CHUNK_SIZE = 8192;

    /**
     * Matches the charset parameter of a Content-Type value. An optional opening quote is
     * skipped, and the name may contain every character Java accepts in a charset name
     * (letters, digits, '-', '+', '.', ':', '_'), so values such as {@code Shift_JIS} or
     * {@code "utf-8"} are captured whole.
     */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"']?([A-Za-z0-9._:+-]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a {@code <meta http-equiv="Content-Type" content="...">} tag, with or without a
     * self-closing slash, and captures the value of its {@code content} attribute.
     */
    private static final Pattern META_CONTENT_TYPE_PATTERN = Pattern.compile(
            "<META\\s+HTTP-EQUIV\\s*=\\s*[\"']Content-Type[\"']\\s+CONTENT\\s*=\\s*[\"']([^\"']*)[\"']\\s*/?>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the charset name from a Content-Type value.
     *
     * @param contenttype the Content-Type value; may be {@code null} when the header is absent
     * @param fallback the name to return when no charset parameter is present
     * @return the declared charset name, or {@code fallback}
     */
    private static String parseContentType(String contenttype, String fallback) {
        if (contenttype == null) {
            return fallback;
        }
        Matcher m = CHARSET_PATTERN.matcher(contenttype);
        return m.find() ? m.group(1) : fallback;
    }

    /**
     * Extracts the charset name from a Content-Type value, defaulting to ISO-8859-1.
     *
     * @param contenttype the Content-Type value; may be {@code null}
     * @return the declared charset name, or {@code "ISO-8859-1"} when none is declared
     */
    private static String parseContentType(String contenttype) {
        return parseContentType(contenttype, DEFAULT_CHARSET);
    }

    /**
     * Looks for a Content-Type meta tag in the document and returns the charset it declares.
     *
     * @param html the document text (only ASCII-range characters need to be intact)
     * @param defenc the charset name to keep when the document has no such meta tag, or the
     *     tag carries no charset parameter
     * @return the charset name declared by the meta tag, or {@code defenc}
     */
    private static String parseMetaContentType(String html, String defenc) {
        Matcher m = META_CONTENT_TYPE_PATTERN.matcher(html);
        // A meta tag without a charset parameter must not discard the header's charset.
        return m.find() ? parseContentType(m.group(1), defenc) : defenc;
    }

    /**
     * Reads the complete response body of the connection.
     *
     * @param con a connection whose response is ready to be read
     * @return exactly the bytes that were received, with no padding
     * @throws IOException if reading the response fails
     */
    private static byte[] readBody(HttpURLConnection con) throws IOException {
        // Content-Length is only a sizing hint; the stream is always read to its end so that
        // a missing or oversized length neither pads nor truncates the result.
        int declared = con.getContentLength();
        int capacity = declared > 0 ? Math.min(declared, MAX_INITIAL_CAPACITY) : CHUNK_SIZE;
        ByteArrayOutputStream out = new ByteArrayOutputStream(capacity);
        InputStream in = con.getInputStream();
        try {
            byte[] chunk = new byte[CHUNK_SIZE];
            int n;
            while ((n = in.read(chunk)) != -1) {
                out.write(chunk, 0, n);
            }
        } finally {
            in.close();
        }
        return out.toByteArray();
    }

    /**
     * Fetches the given URL and returns its body decoded with the declared charset.
     *
     * @param urlstr an {@code http} or {@code https} URL
     * @return the decoded response body
     * @throws IOException if the URL is malformed, the transfer fails, or the declared charset
     *     is not supported by this JVM
     * @throws IllegalArgumentException if the server answers with a status other than 200 OK
     */
    public static String download(String urlstr) throws IOException {
        URL url = new URL(urlstr);
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        try {
            con.connect();
            if (con.getResponseCode() != HttpURLConnection.HTTP_OK) {
                // The message is read while the connection is still open.
                throw new IllegalArgumentException(
                        "URL " + urlstr + " returned " + con.getResponseMessage());
            }
            String headerCharset = parseContentType(con.getContentType());
            byte[] body = readBody(con);
            // The meta tag consists of ASCII characters only, so an ASCII decode is enough to
            // locate it before the real charset is known.
            String asciiView = new String(body, "US-ASCII");
            String charset = parseMetaContentType(asciiView, headerCharset);
            return new String(body, charset);
        } finally {
            con.disconnect();
        }
    }

    /**
     * Downloads three sample pages and prints each one to standard output.
     *
     * @param args ignored
     * @throws Exception if any download fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println(download("http://arne:81/~arne/f1.html"));
        System.out.println(download("http://arne:81/~arne/f2.html"));
        System.out.println(download("http://arne:81/~arne/f3.html"));
    }
}