java中利用正則,過濾網頁標籤.......
阿新 • • 發佈:2019-01-29
在開發中有時候會遇到在一大串字串中替換或者去除某個特定的字串,以下例子是過濾html頁面字串的例項,說明正則在其中的作用:
package com.project.admin.common.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* html標籤
* @author xwp
*
*/
public class HtmlUtil {
// Constants used to find one kind of embedded video reference and replace it with a new player snippet.
// Text that precedes the video id inside the embed URL (used both as a literal and as part of a regex).
private static final String pc_embed_first="http://player.youku.com/player.php/sid/";
// Text that follows the video id inside the embed URL.
private static final String pc_embed_end="==/isShowRelatedVideo";
// Regex matching from "<embed" up to the nearest following " />" (a space, then "/>").
private static final String embed_tag="<embed[^>]*?[\\s\\S]*? \\/>";
// Replacement markup for an embed tag; the ###videoAddress### placeholder is substituted with the video id.
private static final String smallVideo="<div id='youkuplayer' style='width:6.4rem;height:3.2rem'></div><script type='text/javascript' src='http://player.youku.com/jsapi'> </script><script type='text/javascript'>player = new YKU.Player('youkuplayer',{styleid: '0',client_id: 'ec5fe2a0dce21ad3',vid: '###videoAddress###',newPlayer: true}); </script>";
private static final String regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // Regex for a <script> element, from its opening tag through </script>
private static final String regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; // Regex for a <style> element, from its opening tag through </style>
private static final String regEx_html = "<[^>]+>"; // Regex for any single HTML tag
private static final String regEx_space = "\\s*|\t|\r|\n";// Regex for whitespace: any run of \s characters, plus explicit tab / CR / LF alternatives
// Regexes for removing inline presentation attributes
private static final String remove_style ="style=\"[^>]*?;\""; // Regex for a double-quoted style="..." attribute whose value ends with ';'
private static final String remove_width ="width=\"[^>]*?\""; // Regex for a double-quoted width="..." attribute
private static final String remove_height ="height=\"[^>]*?\""; // Regex for a double-quoted height="..." attribute
/** Precompiled, case-insensitive form of {@code regEx_script}. */
private static final Pattern SCRIPT_ELEMENT_PATTERN = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
/** Precompiled, case-insensitive form of {@code regEx_style}. */
private static final Pattern STYLE_ELEMENT_PATTERN = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE);
/** Precompiled, case-insensitive form of {@code regEx_html}. */
private static final Pattern HTML_TAG_PATTERN = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
/** Precompiled form of {@code regEx_space}. */
private static final Pattern WHITESPACE_PATTERN = Pattern.compile(regEx_space);

/**
 * Strips markup from an HTML string and returns what is left.
 *
 * <p>The removals run in a fixed order: {@code <script>} elements with their content,
 * {@code <style>} elements with their content, every remaining tag, and finally all
 * whitespace characters. Script and style elements go first so that their content is
 * dropped together with the tags instead of surviving the generic tag removal.
 *
 * <p>The patterns are compiled once when the class is initialised rather than on every
 * call; compiled {@link Pattern} instances are immutable and can be shared.
 *
 * @param htmlStr the HTML text to clean; must not be {@code null}
 * @return the input with tags and whitespace removed
 * @throws NullPointerException if {@code htmlStr} is {@code null}
 */
public static String delHTMLTag(String htmlStr) {
    String text = SCRIPT_ELEMENT_PATTERN.matcher(htmlStr).replaceAll("");
    text = STYLE_ELEMENT_PATTERN.matcher(text).replaceAll("");
    text = HTML_TAG_PATTERN.matcher(text).replaceAll("");
    text = WHITESPACE_PATTERN.matcher(text).replaceAll("");
    return text.trim();
}
/**
 * Extracts the text of an HTML string: removes the markup via {@code delHTMLTag} and
 * then removes non-breaking spaces.
 *
 * <p>{@code delHTMLTag} already deletes every {@code \s} whitespace character, so
 * removing ordinary spaces afterwards has no effect. What does survive tag stripping is
 * the {@code &nbsp;} entity and the raw U+00A0 character (which {@code \s} does not
 * match), so those are what this method removes.
 *
 * @param htmlStr the HTML text to convert; must not be {@code null}
 * @return the text with tags, whitespace and non-breaking spaces removed
 */
public static String getTextFromHtml(String htmlStr) {
    String text = delHTMLTag(htmlStr);
    // Literal replacements: neither argument is meant to be interpreted as a regex.
    return text.replace("&nbsp;", "").replace("\u00A0", "");
}
/** Precompiled, case-insensitive form of {@code remove_style}. */
private static final Pattern INLINE_STYLE_ATTR_PATTERN = Pattern.compile(remove_style, Pattern.CASE_INSENSITIVE);
/** Precompiled, case-insensitive form of {@code remove_width}. */
private static final Pattern WIDTH_ATTR_PATTERN = Pattern.compile(remove_width, Pattern.CASE_INSENSITIVE);
/** Precompiled, case-insensitive form of {@code remove_height}. */
private static final Pattern HEIGHT_ATTR_PATTERN = Pattern.compile(remove_height, Pattern.CASE_INSENSITIVE);

/**
 * Removes inline presentation attributes from an HTML string.
 *
 * <p>Three kinds of text are deleted, in this order: {@code style="..."},
 * {@code width="..."} and {@code height="..."}. Only the attribute text itself is
 * removed; whitespace around it is left in place. As defined by {@code remove_style},
 * a style attribute is matched only when it is double-quoted and its value ends
 * with {@code ;}.
 *
 * <p>The patterns are compiled once when the class is initialised rather than on every
 * call.
 *
 * @param htmlStr the HTML text to process; must not be {@code null}
 * @return the HTML with the matched attributes removed
 * @throws NullPointerException if {@code htmlStr} is {@code null}
 */
public static String removeStyleHtml(String htmlStr) {
    String result = INLINE_STYLE_ATTR_PATTERN.matcher(htmlStr).replaceAll("");
    result = WIDTH_ATTR_PATTERN.matcher(result).replaceAll("");
    return HEIGHT_ATTR_PATTERN.matcher(result).replaceAll("");
}
/**
 * Matches an embed URL and captures, as group 1, the text between {@code pc_embed_first}
 * and {@code pc_embed_end}. Both delimiters are quoted so that characters such as
 * {@code .} are matched literally, the same way {@code String.contains} treats them.
 */
private static final Pattern VIDEO_ID_PATTERN =
        Pattern.compile(Pattern.quote(pc_embed_first) + "(.*?)" + Pattern.quote(pc_embed_end));

/**
 * Collects the video ids found in an HTML string.
 *
 * <p>A video id is the shortest run of characters (not spanning a line break) that sits
 * between {@code pc_embed_first} and {@code pc_embed_end}.
 *
 * @param htmlStr the HTML text to scan; must not be {@code null}
 * @return the ids in order of appearance; empty when none are found
 * @throws NullPointerException if {@code htmlStr} is {@code null}
 */
public static List<String> getVideoId(String htmlStr) {
    List<String> videoIds = new ArrayList<String>();
    Matcher matcher = VIDEO_ID_PATTERN.matcher(htmlStr);
    // find() alone drives the loop: it returns false once no further match exists.
    // hitEnd() only reports whether the last match operation reached the end of the
    // input, so it is not a test for "more matches remain".
    while (matcher.find()) {
        videoIds.add(matcher.group(1));
    }
    return videoIds;
}
/** Precompiled form of {@code embed_tag}. */
private static final Pattern EMBED_TAG_PATTERN = Pattern.compile(embed_tag);

/**
 * Collects the embed tags found in an HTML string.
 *
 * <p>Each result is the full text matched by {@code embed_tag}. The pattern is compiled
 * once when the class is initialised rather than on every call.
 *
 * @param htmlStr the HTML text to scan; must not be {@code null}
 * @return the matched tag texts in order of appearance; empty when none are found
 * @throws NullPointerException if {@code htmlStr} is {@code null}
 */
public static List<String> getEmbedTag(String htmlStr) {
    List<String> embedTags = new ArrayList<String>();
    Matcher matcher = EMBED_TAG_PATTERN.matcher(htmlStr);
    // find() alone drives the loop: it returns false once no further match exists.
    // hitEnd() only reports whether the last match operation reached the end of the
    // input, so it is not a test for "more matches remain".
    while (matcher.find()) {
        embedTags.add(matcher.group());
    }
    return embedTags;
}
/**
 * Rewrites news content that contains embedded videos, for use by the micro-site:
 * every embed tag is replaced by the {@code smallVideo} player snippet carrying the
 * matching video id.
 *
 * <p>The input is returned unchanged when it does not contain {@code pc_embed_first},
 * or when the number of video ids found differs from the number of embed tags found
 * (ids and tags are paired by position, so a mismatch cannot be resolved safely).
 *
 * @param htmlStr the HTML text to process; must not be {@code null}
 * @return the HTML with embed tags replaced, or the original text if nothing applies
 * @throws NullPointerException if {@code htmlStr} is {@code null}
 */
public static String doSmallVideo(String htmlStr) {
    if (!htmlStr.contains(pc_embed_first)) {
        return htmlStr;
    }
    List<String> videoIds = getVideoId(htmlStr);
    List<String> embedTags = getEmbedTag(htmlStr);
    if (videoIds.size() != embedTags.size()) {
        return htmlStr;
    }
    String result = htmlStr;
    for (int i = 0; i < embedTags.size(); i++) {
        String player = smallVideo.replace("###videoAddress###", videoIds.get(i));
        // The tag text comes straight from the document, so it must be matched
        // literally. replaceAll would compile it as a regex (characters such as
        // '?', '+', '(' change its meaning or make it invalid) and would also
        // interpret '$' and '\' in the replacement.
        result = result.replace(embedTags.get(i), player);
    }
    return result;
}