去掉HTML程式碼 保留文字和圖片
阿新 • • 發佈:2020-11-04
取得HTML中的img
/// <summary>
/// Collects the <c>src</c> value of every <c>&lt;img&gt;</c> tag found in an HTML fragment.
/// </summary>
/// <param name="sHtmlText">HTML markup to scan. May be null or empty.</param>
/// <returns>
/// The captured image URLs in the order the tags appear. An empty array is returned
/// when the input is null, empty, or contains no matching <c>&lt;img&gt;</c> tag.
/// </returns>
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
    // Regex.Matches throws on null; treat "no markup" as "no images" instead,
    // mirroring how NoHTML handles null/empty input.
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return new string[0];
    }

    // Matches an <img ...> tag (case-insensitive) and captures the src attribute
    // value, with or without surrounding quotes, in the named group "imgUrl".
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

    MatchCollection matches = regImg.Matches(sHtmlText);
    string[] sUrlList = new string[matches.Count];

    // Copy the captured URL of each match into the result array.
    int i = 0;
    foreach (Match match in matches)
    {
        sUrlList[i++] = match.Groups["imgUrl"].Value;
    }

    return sUrlList;
}
取得HTML中的文字
/// <summary>
/// Strips markup from an HTML fragment and returns the remaining text.
/// </summary>
/// <param name="htmlString">HTML markup to convert. May be null or empty.</param>
/// <returns>
/// The text with script blocks, tags and comment delimiters removed and a small set of
/// character entities decoded; <see cref="string.Empty"/> when the input is null or empty.
/// </returns>
public static string NoHTML(string htmlString)
{
    if (string.IsNullOrEmpty(htmlString))
    {
        return string.Empty;
    }

    // Remove script blocks. Singleline lets '.' cross line breaks so that scripts
    // spanning several lines are removed together with their content.
    htmlString = Regex.Replace(htmlString, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove every remaining tag.
    htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Drop a line break together with the whitespace that follows it.
    htmlString = Regex.Replace(htmlString, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Clean up comment delimiters left behind by the tag removal above.
    htmlString = Regex.Replace(htmlString, @"-->", "", RegexOptions.IgnoreCase);
    htmlString = Regex.Replace(htmlString, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode entities in ONE pass. Sequential passes would decode twice
    // (e.g. "&amp;lt;" -> "&lt;" -> "<") and corrupt escaped text.
    htmlString = Regex.Replace(
        htmlString,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new MatchEvaluator(DecodeHtmlEntity),
        RegexOptions.IgnoreCase);

    return htmlString;
}

/// <summary>
/// Maps one matched entity (name or decimal code in group 1) to its replacement text.
/// Numeric entities without an explicit mapping are removed.
/// </summary>
private static string DecodeHtmlEntity(Match match)
{
    switch (match.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            // Only unmapped numeric entities reach this branch; they are dropped.
            return string.Empty;
    }
}