Url解析组件重构及优化

This commit is contained in:
zhaoqichao
2023-07-05 09:29:21 +08:00
parent c47b48760d
commit 3c77ed3c5f
10 changed files with 218 additions and 114 deletions

View File

@@ -3,14 +3,15 @@ package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.abin.mallchat.common.common.utils.FutureUtils;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.data.util.Pair;
import javax.annotation.Nullable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -20,46 +21,55 @@ import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Description: urlTitle查询抽象类
* Author: <a href="https://github.com/zongzibinbin">abin</a>
* Date: 2023-05-27
* @author zhaoqichao
* @date 2023/7/3 16:38
*/
@Slf4j
public abstract class AbstractUrlTitleDiscover implements UrlTitleDiscover {
public abstract class AbstractUrlDiscover implements UrlDiscover {
// Regex used to recognise links inside free text: an optional http/https scheme,
// an optional "www" prefix, a dotted host name, then an optional path/query tail.
private static final Pattern PATTERN = Pattern.compile("((http|https)://)?(www.)?([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?");
/**
 * Finds every link in {@code content} (using {@code PATTERN}) and resolves each one
 * asynchronously, one {@code CompletableFuture} per match.
 * Returns an empty map when {@code content} is blank. If the same link is matched
 * more than once, the first resolved value is kept.
 * NOTE(review): the previous and the new signature/body lines both appear below
 * because this listing is a diff view.
 */
@Nullable
@javax.annotation.Nullable
@Override
public Map<String, String> getContentTitleMap(String content) {
public Map<String, UrlInfo> getUrlContentMap(String content) {
if (StrUtil.isBlank(content)) {
return new HashMap<>();
}
List<String> matchList = ReUtil.findAll(PATTERN, content, 0);
// Issue the requests in parallel
List<CompletableFuture<Pair<String, String>>> futures = matchList.stream().map(match -> CompletableFuture.supplyAsync(() -> {
String title = getUrlTitle(match);
return StringUtils.isNotEmpty(title) ? Pair.of(match, title) : null;
List<CompletableFuture<Pair<String, UrlInfo>>> futures = matchList.stream().map(match -> CompletableFuture.supplyAsync(() -> {
UrlInfo urlInfo = getContent(match);
// A null result marks a link that could not be resolved
return Objects.isNull(urlInfo) ? null : Pair.of(match, urlInfo);
})).collect(Collectors.toList());
// presumably sequenceNonNull drops the null results before they reach toMap — confirm in FutureUtils
CompletableFuture<List<Pair<String, String>>> future = FutureUtils.sequenceNonNull(futures);
CompletableFuture<List<Pair<String, UrlInfo>>> future = FutureUtils.sequenceNonNull(futures);
// Assemble the results; join() blocks until every lookup has finished
return future.join().stream().collect(Collectors.toMap(Pair::getFirst, Pair::getSecond, (a, b) -> a));
}
/**
 * Resolves a single link: loads its document via {@code getUrlDocument(assemble(url))}
 * and, when a document was obtained, builds a {@code UrlInfo} from the values returned by
 * {@code getTitle}, {@code getDescription} and {@code getImage}.
 * Returns {@code null} when no document could be loaded.
 * NOTE(review): the previous and the new signature/return lines both appear below
 * because this listing is a diff view.
 */
@Nullable
@Override
public String getUrlTitle(String url) {
public UrlInfo getContent(String url) {
Document document = getUrlDocument(assemble(url));
if (Objects.isNull(document)) {
return null;
}
return getDocTitle(document);
return UrlInfo.builder()
.title(getTitle(document))
.description(getDescription(document))
// getImage receives the assembled link (scheme included), not the raw match
.image(getImage(assemble(url),document)).build();
}
/**
 * Makes sure a matched link carries a scheme so that it can be fetched.
 *
 * @param url link text as matched in the message; may lack a scheme
 * @return {@code url} unchanged when it already starts with {@code http://} or
 *         {@code https://}, otherwise {@code url} prefixed with {@code http://}
 */
private String assemble(String url) {
    // Test for the complete scheme rather than the letters "http": a scheme-less host
    // such as "httpbin.org" also starts with "http" and still needs the prefix.
    boolean hasScheme = url != null && (url.startsWith("http://") || url.startsWith("https://"));
    return hasScheme ? url : "http://" + url;
}
@@ -69,7 +79,7 @@ public abstract class AbstractUrlTitleDiscover implements UrlTitleDiscover {
connect.timeout(2000);
return connect.get();
} catch (Exception e) {
log.error("find title error:url:{}", matchUrl, e);
log.error("find error:url:{}", matchUrl, e);
}
return null;
}

View File

@@ -0,0 +1,56 @@
package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.util.StrUtil;
import io.jsonwebtoken.lang.Objects;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * General-purpose discoverer that reads a page's title, description and icon
 * from ordinary HTML elements ({@code <title>}, {@code <meta>}, {@code <link>}).
 *
 * @author zhaoqichao
 * @date 2023/7/3 16:54
 */
public class CommonUrlDiscover extends AbstractUrlDiscover {

    /** Sentence terminator (Chinese full stop) at which the description is cut. */
    private static final String SENTENCE_END = "。";

    /**
     * Returns the document's title.
     *
     * @param document parsed page
     * @return the value of {@code document.title()}
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        return document.title();
    }

    /**
     * Returns a one-sentence description of the page.
     * The {@code description} meta tag is preferred; the {@code keywords} meta tag is
     * used when the description is blank.
     *
     * @param document parsed page
     * @return the text up to (not including) the first sentence terminator, the whole
     *         text when it contains no terminator, or the blank value when neither meta
     *         tag has content
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        String description = document.head().select("meta[name=description]").attr("content");
        String keywords = document.head().select("meta[name=keywords]").attr("content");
        String content = StrUtil.isNotBlank(description) ? description : keywords;
        if (StrUtil.isBlank(content)) {
            return content;
        }
        // Keep only the first sentence. Searching for "" always yields index 0 and would
        // throw the whole description away, so search for a real terminator and fall
        // back to the full text when there is none.
        int end = content.indexOf(SENTENCE_END);
        return end > 0 ? content.substring(0, end) : content;
    }

    /**
     * Works out an icon address for the page from its {@code <link>} elements.
     *
     * @param url      address of the page, scheme included
     * @param document parsed page
     * @return an icon address, or {@code null} when the page carries an
     *         {@code og:title} property (the original note treats such pages as WeChat links)
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        // Pages that expose og:title are treated as WeChat links and are not handled here
        if (StrUtil.isNotBlank(document.getElementsByAttributeValue("property", "og:title").attr("content"))) {
            return null;
        }
        String image = document.select("link[type=image/x-icon]").attr("href");
        // Nothing declared as image/x-icon: fall back to a link whose rel ends with "icon"
        String href = StrUtil.isEmpty(image) ? document.select("link[rel$=icon]").attr("href") : image;
        // What is left of the href once slashes and the file name are removed
        String hrefRemainder = StrUtil.removeAny(StrUtil.removeAny(href, "/"), "favicon.ico");
        // The href already contains part of the page's host name
        if (StrUtil.isNotBlank(hrefRemainder)
                && StrUtil.containsAny(StrUtil.removePrefix(url, "http://"), hrefRemainder)) {
            // Drop every leading slash so a protocol-relative href ("//host/favicon.ico")
            // does not turn into "http:///host/favicon.ico"
            return "http://" + stripLeadingSlashes(href);
        }
        // The page address itself already points at the icon
        if (StrUtil.containsAny(url, "favicon")) {
            return url;
        }
        // The href is already absolute (an "https" href contains "http" as well)
        if (StrUtil.containsAny(href, "http")) {
            return href;
        }
        // Relative href: resolve it against the page address.
        // NOTE(review): if no icon link was found and href is empty, this yields url + "/" — confirm that is intended.
        return StrUtil.format("{}/{}", url, StrUtil.removePrefix(href, "/"));
    }

    /** Returns {@code value} without its leading '/' characters. */
    private static String stripLeadingSlashes(String value) {
        int start = 0;
        while (start < value.length() && value.charAt(start) == '/') {
            start++;
        }
        return value.substring(start);
    }
}

View File

@@ -1,15 +0,0 @@
package com.abin.mallchat.common.common.utils.discover;
import org.jsoup.nodes.Document;
/**
 * Description: general-purpose title resolver.
 * Author: <a href="https://github.com/zongzibinbin">abin</a>
 * Date: 2023-05-27
 */
public class CommonUrlTitleDiscover extends AbstractUrlTitleDiscover {

    /**
     * Returns the title reported by the parsed document.
     *
     * @param document parsed page
     * @return the value of {@code document.title()}
     */
    @Override
    public String getDocTitle(Document document) {
        final String pageTitle = document.title();
        return pageTitle;
    }
}

View File

@@ -0,0 +1,60 @@
package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.util.StrUtil;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.List;
/**
 * Description: discoverer that asks an ordered list of delegates and returns
 * the first non-blank answer.
 * Author: <a href="https://github.com/zongzibinbin">abin</a>
 * Date: 2023-05-27
 */
public class PrioritizedUrlDiscover extends AbstractUrlDiscover {

    /** Delegates in priority order: the common discoverer first, then the WeChat one. */
    private final List<UrlDiscover> delegates;

    public PrioritizedUrlDiscover() {
        delegates = new ArrayList<>(2);
        delegates.add(new CommonUrlDiscover());
        delegates.add(new WxUrlDiscover());
    }

    /**
     * @param document parsed page
     * @return the first non-blank title produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        // The pipeline is lazy: delegates after the first non-blank answer are not called
        return delegates.stream()
                .map(delegate -> delegate.getTitle(document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }

    /**
     * @param document parsed page
     * @return the first non-blank description produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        return delegates.stream()
                .map(delegate -> delegate.getDescription(document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }

    /**
     * @param url      address of the page
     * @param document parsed page
     * @return the first non-blank image address produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        return delegates.stream()
                .map(delegate -> delegate.getImage(url, document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }
}

View File

@@ -1,33 +0,0 @@
package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.util.StrUtil;
import org.jsoup.nodes.Document;
import java.util.ArrayList;
import java.util.List;
/**
 * Description: title resolver that asks an ordered list of delegates and
 * returns the first non-blank title.
 * Author: <a href="https://github.com/zongzibinbin">abin</a>
 * Date: 2023-05-27
 */
public class PrioritizedUrlTitleDiscover extends AbstractUrlTitleDiscover {

    /** Delegates in priority order: the common resolver first, then the WeChat one. */
    private final List<UrlTitleDiscover> delegates;

    public PrioritizedUrlTitleDiscover() {
        delegates = new ArrayList<>(2);
        delegates.add(new CommonUrlTitleDiscover());
        delegates.add(new WxUrlTitleDiscover());
    }

    /**
     * @param document parsed page
     * @return the first non-blank title produced by a delegate, or {@code null} if none
     */
    @Override
    public String getDocTitle(Document document) {
        // The pipeline is lazy: delegates after the first non-blank answer are not called
        return delegates.stream()
                .map(delegate -> delegate.getDocTitle(document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }
}

View File

@@ -0,0 +1,46 @@
package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.date.StopWatch;
import cn.hutool.core.util.StrUtil;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import org.jsoup.nodes.Document;
import javax.annotation.Nullable;
import java.util.Map;
/**
 * Contract for components that discover display information (title,
 * description, image) for links.
 *
 * @author zhaoqichao
 * @date 2023/7/3 16:34
 */
public interface UrlDiscover {

    /**
     * Resolves the links contained in a piece of text.
     *
     * @param content text that may contain links
     * @return information per link, keyed by the link
     */
    @Nullable
    Map<String,UrlInfo> getUrlContentMap(String content);

    /**
     * Resolves a single link.
     *
     * @param url the link to look up
     * @return the information found for the link, or {@code null}
     */
    @Nullable
    UrlInfo getContent(String url);

    /**
     * @param document parsed page
     * @return the page title, or {@code null}
     */
    @Nullable
    String getTitle(Document document);

    /**
     * @param document parsed page
     * @return the page description, or {@code null}
     */
    @Nullable
    String getDescription(Document document);

    /**
     * @param url      address of the page
     * @param document parsed page
     * @return the address of an image for the page, or {@code null}
     */
    @Nullable
    String getImage(String url, Document document);

    /**
     * Manual smoke test: resolves the links in a sample sentence, prints the
     * resulting map and then the elapsed milliseconds.
     */
    public static void main(String[] args) {
        StopWatch timer = new StopWatch();
        timer.start();
        // Narrower inputs used while debugging: only the link with a port number,
        // or only http://mallchat.cn.
        String sample = "其中包含一个URL www.baidu.com,一个带有端口号的URL http://www.jd.com:80, 一个带有路径的URL http://mallchat.cn, 还有美团技术文章https://mp.weixin.qq.com/s/hwTf4bDck9_tlFpgVDeIKg ";
        Map<String, UrlInfo> resolved = new PrioritizedUrlDiscover().getUrlContentMap(sample);
        System.out.println(resolved);
        // Stop only after printing so the measurement covers the same work as before
        timer.stop();
        System.out.println(timer.getTotalTimeMillis());
    }
}

View File

@@ -1,35 +0,0 @@
package com.abin.mallchat.common.common.utils.discover;
import cn.hutool.core.date.StopWatch;
import org.jsoup.nodes.Document;
import javax.annotation.Nullable;
import java.util.Map;
/**
 * Contract for components that discover the titles of links.
 */
public interface UrlTitleDiscover {

    /**
     * Resolves the links contained in a piece of text.
     *
     * @param content text that may contain links
     * @return titles keyed by link
     */
    @Nullable
    Map<String, String> getContentTitleMap(String content);

    /**
     * @param url the link to look up
     * @return the title found for the link, or {@code null}
     */
    @Nullable
    String getUrlTitle(String url);

    /**
     * @param document parsed page
     * @return the page title, or {@code null}
     */
    @Nullable
    String getDocTitle(Document document);

    /**
     * Manual smoke test: resolves the links in a sample sentence, prints the
     * resulting map and then the elapsed milliseconds.
     * Timings noted by the author: about 974 ms with asynchronous parallel lookups
     * merged afterwards, versus 1349 / 1291 / 1283 / 1559 ms when the links were
     * visited one after another.
     * The recorded sample run printed entries for http://mallchat.cn (MallChat),
     * www.baidu.com, the mp.weixin.qq.com article and http://www.jd.com:80.
     */
    public static void main(String[] args) {
        StopWatch timer = new StopWatch();
        timer.start();
        String sample = "这是一个很长的字符串再来 www.github.com其中包含一个URL www.baidu.com,, 一个带有端口号的URL http://www.jd.com:80, 一个带有路径的URL http://mallchat.cn, 还有美团技术文章https://mp.weixin.qq.com/s/hwTf4bDck9_tlFpgVDeIKg ";
        Map<String, String> titlesByLink = new PrioritizedUrlTitleDiscover().getContentTitleMap(sample);
        System.out.println(titlesByLink);
        // Stop only after printing so the measurement covers the same work as before
        timer.stop();
        System.out.println(timer.getTotalTimeMillis());
    }
}

View File

@@ -0,0 +1,30 @@
package com.abin.mallchat.common.common.utils.discover;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
/**
 * Description: discoverer for WeChat official-account articles; every value is
 * read from the page's Open Graph {@code property} attributes.
 * Author: <a href="https://github.com/zongzibinbin">abin</a>
 * Date: 2023-05-27
 */
public class WxUrlDiscover extends AbstractUrlDiscover {

    /**
     * @param document parsed page
     * @return the {@code content} of the element whose {@code property} is {@code og:title}
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        return contentOfProperty(document, "og:title");
    }

    /**
     * @param document parsed page
     * @return the {@code content} of the element whose {@code property} is {@code og:description}
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        return contentOfProperty(document, "og:description");
    }

    /**
     * @param url      address of the page (not used by this implementation)
     * @param document parsed page
     * @return the {@code content} of the element whose {@code property} is {@code og:image}
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        return contentOfProperty(document, "og:image");
    }

    /** Reads the {@code content} attribute from the elements whose {@code property} equals {@code property}. */
    private static String contentOfProperty(Document document, String property) {
        return document.getElementsByAttributeValue("property", property).attr("content");
    }
}

View File

@@ -1,15 +0,0 @@
package com.abin.mallchat.common.common.utils.discover;
import org.jsoup.nodes.Document;
/**
 * Description: title resolver for WeChat official-account articles.
 * Author: <a href="https://github.com/zongzibinbin">abin</a>
 * Date: 2023-05-27
 */
public class WxUrlTitleDiscover extends AbstractUrlTitleDiscover {

    /**
     * @param document parsed page
     * @return the {@code content} of the element whose {@code property} is {@code og:title}
     */
    @Override
    public String getDocTitle(Document document) {
        final String ogTitle = document
                .getElementsByAttributeValue("property", "og:title")
                .attr("content");
        return ogTitle;
    }
}