1. 程式人生 > 實用技巧 >去掉HTML程式碼 保留文字和圖片

去掉HTML程式碼 保留文字和圖片

取得HTML中所有 img 標籤的圖片網址(src)

        /// <summary>
        /// Cached regex that matches an <c>img</c> tag and captures the value of its
        /// <c>src</c> attribute (quoted or unquoted) in the named group <c>imgUrl</c>.
        /// </summary>
        private static readonly Regex s_imgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the URLs of all images referenced by <c>img</c> tags in an HTML fragment.
        /// </summary>
        /// <param name="sHtmlText">The HTML source to scan. May be null or empty.</param>
        /// <returns>
        /// The <c>src</c> values of every matched <c>img</c> tag, in document order.
        /// An empty array is returned when the input is null, empty, or has no matching tags.
        /// </returns>
        public static string[] GetHtmlImageUrlList(string sHtmlText)
        {
            // Regex.Matches throws on null; treat "no HTML" as "no images" instead.
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return new string[0];
            }

            // Find every img tag in the input.
            MatchCollection matches = s_imgTagRegex.Matches(sHtmlText);

            // Copy the captured src value of each match into the result array.
            string[] sUrlList = new string[matches.Count];
            int i = 0;
            foreach (Match match in matches)
            {
                sUrlList[i++] = match.Groups["imgUrl"].Value;
            }

            return sUrlList;
        }

取得HTML中的文字

 /// <summary>
        /// Strips scripts, tags and comment markers from an HTML fragment and decodes
        /// a small set of character entities, leaving only the text content.
        /// </summary>
        /// <param name="htmlString">The HTML source to convert. May be null or empty.</param>
        /// <returns>
        /// The text content of <paramref name="htmlString"/>, or an empty string when the
        /// input is null or empty.
        /// </returns>
        public static string NoHTML(string htmlString)
        {
            if (string.IsNullOrEmpty(htmlString))
            {
                return string.Empty;
            }

            // Remove script blocks together with their content. Singleline lets '.' match
            // newlines, so scripts spanning several lines are removed as a whole.
            htmlString = Regex.Replace(htmlString, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // Remove the remaining tags.
            htmlString = Regex.Replace(htmlString, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

            // Remove each line break that is followed by whitespace, together with that whitespace.
            htmlString = Regex.Replace(htmlString, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

            // Remove leftover comment markers: a stray "-->" and anything after an unclosed "<!--".
            htmlString = Regex.Replace(htmlString, @"-->", "", RegexOptions.IgnoreCase);
            htmlString = Regex.Replace(htmlString, @"<!--.*", "", RegexOptions.IgnoreCase);

            // Decode entities in a single pass so that already-decoded text is never decoded
            // again (e.g. "&amp;lt;" must become "&lt;", not "<").
            htmlString = Regex.Replace(
                htmlString,
                @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
                DecodeHtmlEntity,
                RegexOptions.IgnoreCase);

            return htmlString;
        }

        /// <summary>
        /// Maps one matched entity to its replacement text. Numeric entities that are not
        /// listed here are replaced with an empty string.
        /// </summary>
        /// <param name="match">A match whose first group holds the entity name or "#" plus digits.</param>
        /// <returns>The replacement text for the entity.</returns>
        private static string DecodeHtmlEntity(Match match)
        {
            // The pattern is matched case-insensitively, so normalize before comparing.
            switch (match.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    return " ";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    // Any other numeric entity is dropped.
                    return "";
            }
        }