fix:url解析图片时进行链接有效性校验

This commit is contained in:
zhaoqichao
2023-07-12 10:24:20 +08:00
parent eddee5647f
commit 4db52c35ce
3 changed files with 43 additions and 9 deletions

View File

@@ -11,6 +11,9 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.data.util.Pair;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -59,7 +62,7 @@ public abstract class AbstractUrlDiscover implements UrlDiscover {
return UrlInfo.builder()
.title(getTitle(document))
.description(getDescription(document))
.image(getImage(assemble(url),document)).build();
.image(getImage(assemble(url), document)).build();
}
@@ -83,4 +86,32 @@ public abstract class AbstractUrlDiscover implements UrlDiscover {
return null;
}
/**
 * Checks whether a link can be reached over HTTP(S) and is not a download link.
 *
 * <p>A request is sent to {@code href}; the link counts as valid when the response
 * status is 200 (OK), 302 (moved temporarily) or 304 (not modified) and the response
 * carries no {@code Content-Disposition} header (a link with that header is treated
 * as a download link). Every failure -- null or blank input, malformed URL, non-HTTP
 * protocol, I/O error or timeout -- yields {@code false} instead of an exception.
 *
 * @param href absolute URL to probe; may be {@code null}
 * @return {@code true} if the link answered with an accepted status and is not a
 *         download link, {@code false} otherwise
 */
public static boolean isConnect(String href) {
    if (href == null || href.trim().isEmpty()) {
        return false;
    }
    // Bound the probe so an unresponsive host cannot block the caller indefinitely.
    final int timeoutMillis = 5000;
    HttpURLConnection connection = null;
    try {
        // The cast fails for non-HTTP protocols (file:, ftp:, ...); that is reported as "invalid" below.
        connection = (HttpURLConnection) new URL(href).openConnection();
        connection.setConnectTimeout(timeoutMillis);
        connection.setReadTimeout(timeoutMillis);
        int status = connection.getResponseCode();
        boolean accepted = status == HttpURLConnection.HTTP_OK
                || status == HttpURLConnection.HTTP_MOVED_TEMP
                || status == HttpURLConnection.HTTP_NOT_MODIFIED;
        // A Content-Disposition header marks the response as a download rather than an inline resource.
        return accepted && connection.getHeaderField("Content-Disposition") == null;
    } catch (Exception ignored) {
        // Deliberately broad: this is a yes/no probe, so any failure simply means "not a valid link".
        return false;
    } finally {
        // Release the connection on every path, including the success path and exceptions.
        if (connection != null) {
            connection.disconnect();
        }
    }
}
}

View File

@@ -7,6 +7,10 @@ import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
/**
* @author zhaoqichao
* @date 2023/7/3 16:54
@@ -34,19 +38,17 @@ public class CommonUrlDiscover extends AbstractUrlDiscover {
String image = document.select("link[type=image/x-icon]").attr("href");
//如果没有去匹配含有icon属性的logo
String href = StrUtil.isEmpty(image) ? document.select("link[rel$=icon]").attr("href") : image;
//如果icon中已经包含了url部分域名
if (StrUtil.isNotBlank(StrUtil.removeAny(StrUtil.removeAny(href, "/"), "favicon.ico")) &&
StrUtil.containsAny(StrUtil.removePrefix(url, "http://"), StrUtil.removeAny(StrUtil.removeAny(href, "/"), "favicon.ico"))) {
return "http://" + StrUtil.removePrefix(href, "/");
}
//如果url已经包含了logo
if (StrUtil.containsAny(url, "favicon")) {
return url;
}
//如果logo中有url
if (StrUtil.containsAny(href, "http") || StrUtil.containsAny(href, "https")) {
//如果icon可以直接访问或者包含了http
if (isConnect(!StrUtil.startWith(href, "http") ? "http:" + href : href)) {
return href;
}
return StrUtil.format("{}/{}", url, StrUtil.removePrefix(href, "/"));
}
}

View File

@@ -25,6 +25,7 @@ public class WxUrlDiscover extends AbstractUrlDiscover {
/**
 * Resolves the preview image from the document's {@code og:image} property.
 *
 * <p>Reads the {@code content} attribute of the elements whose {@code property}
 * attribute equals {@code og:image} and returns it only if it passes the
 * {@code isConnect} check.
 *
 * @param url      the page URL; not used by this implementation
 * @param document the parsed page to read the {@code og:image} value from
 * @return the image link if it passes {@code isConnect}, otherwise {@code null}
 */
@Nullable
@Override
public String getImage(String url, Document document) {
    String href = document.getElementsByAttributeValue("property", "og:image").attr("content");
    // Only hand back the link when it is actually reachable; callers get null otherwise.
    return isConnect(href) ? href : null;
}
}