隐藏

C# 正则表达式实例

发布:2020/12/3 16:29:00 作者:管理员 来源:本站 浏览次数:908

using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace BgCollection
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form; the only work done here is the call to InitializeComponent.
        /// </summary>
        public Form1()
        {
            // InitializeComponent is not defined in this part of the partial class;
            // presumably the designer-generated part provides it - TODO confirm.
            InitializeComponent();
        }

        private void button1_Click(object sender, EventArgs e)
        {
            string xx = FinalHtml.GetMobileHtml(textBox1.Text, 1);
            //full-content 内容
            //wgt-question-title 标题
            /*
<div class="wgt-question-title">
<h2>甲醛检测一次多少钱</h2>
</div>

            wgt-question-desc-inner
            <div class="wgt-question-desc-inner">
甲醛检测一次多少钱
</div>

             */
            string strContent = xx;
            Regex regex = new Regex("\r\n");
            strContent = regex.Replace(strContent, "");

            //string reg = "<(?<HtmlTag>[\\w]+)[^>]*\\s[class]=(?<Quote>[\"]?)full-content(?(Quote)\\k<Quote>)[\"]?[^>]*>(((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*)</\\k<HtmlTag>>";

            //MatchCollection m = Regex.Matches(strContent, reg, RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline);


            Regex r = new Regex("<div[^>]*>[^<>]*(((?'Open'<div[^>]*>)[^<>]*)+((?'-Open'</div>)[^<>]*)+)*(?(Open)(?!))</div>");
            StringBuilder sb = new StringBuilder();
            MatchString("<div><div class='123'><div class='234'>234</div></div></div>", r, sb);
            //GetHtmlTagAttr(xx, "full-content", "");

            Regex rg = new Regex("<div[^>]*?class=\"full-content\"[^>]*?>(([^<]*(?(?!</div>)<))*)</div>", RegexOptions.Multiline | RegexOptions.Singleline);
            string _html = rg.Match(xx).Value;


            this.Invoke(new Action(() =>
            {

                richTextBox1.Text += xx + "\n\r";
                richTextBox2.Text += Html2Text(GetValue(xx, "<div class=\"wgt-question-title\">", "</div>"));
                //richTextBox3.Text += GetText(strContent, "full-content");
                //richTextBox3.Text += Html2Text(rg.Match(xx).Value) + "\n\r";//去除html标签
                richTextBox3.Text += GetValue(_html, "<div class=\"full-content\">", "</div>") + "\n\r";

            }));
        }
        public static string Html2Text(string htmlStr)
        {
            if (String.IsNullOrEmpty(htmlStr))
            {
                return "";
            }
            string regEx_style = "<style[^>]*?>[\\s\\S]*?<\\/style>"; //定义style的正则表达式
            string regEx_script = "<script[^>]*?>[\\s\\S]*?<\\/script>"; //定义script的正则表达式
            //string regEx_html = "<[^>]+>"; //定义HTML标签的正则表达式
            string regEx_html = @"<(?!\/?br\/?.+?>)[^<>]*>"; //去除HTML Tag,但不去除换行标签<br>
            //msg = msg.replace(/<\/? ((? !img).) *?\/?>/ g, ''); //去除HTML Tag,但不去除换行标签<img>(<img />)
            htmlStr = Regex.Replace(htmlStr, regEx_style, "");//删除css
            htmlStr = Regex.Replace(htmlStr, regEx_script, "");//删除js
            htmlStr = Regex.Replace(htmlStr, regEx_html, "");//删除html标记
            //htmlStr = Regex.Replace(htmlStr, "\\s*|\t|\r|\n", "");//去除tab、空格、空行
            return htmlStr.Trim();
        }
        private void MatchString(string OutString, Regex r, StringBuilder sb)
        {
            MatchCollection ms = r.Matches(OutString);// 获取所有的匹配
            foreach (Match m in ms)
            {
                if (m.Success)
                {
                    sb.AppendLine(m.Groups[0].Value);
                    MatchString(m.Groups[0].Value.Substring(1, m.Groups[0].Value.Length - 1), r, sb);// 去掉匹配到的头和尾的 "[" 和 "]",避免陷入死循环递归中,导致溢出
                }
            }
            return;
        }
        /// <summary>
        /// Currently empty. Presumably wired to the form's Load event; the wiring is not
        /// visible in this file - TODO confirm.
        /// </summary>
        /// <param name="sender">Object that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Form1_Load(object sender, EventArgs e)
        {

        }
        /// <summary>
        /// 获得字符串中开始和结束字符串中间得值
        /// </summary>
        /// <param name="str">字符串</param>
        /// <param name="s">开始</param>
        /// <param name="e">结束</param>
        /// <returns></returns>
        static string GetValue(string str, string s, string e)
        {
            Regex rg = new Regex("(?<=(" + s + "))[.\\s\\S]*?(?=(" + e + "))", RegexOptions.Multiline | RegexOptions.Singleline);
            return rg.Match(str).Value;
        }
        static string GetHtmlValue(string str, string s, string e) {
            Regex rg = new Regex(s + "(.+?)" + e , RegexOptions.Multiline | RegexOptions.Singleline);
            return rg.Match(str).Value;
        }
        static string GetText(string html, string fildname)
        {
            #region 获取内容
            //<div class=\"m\" id=\"sortlist\">(<div[^>]*>(<div[^>]*>(<div[^>]*>.*?</div>|.)*?</div>|.)*?</div>|.)*?</div>
            var _movie_des = "<div(\\s+(title=\"(?<title>[^\"]*?)\"|class=\"(?<class>[^\"]*?)\"|[-\\w]+=\"[^\"]*?\"))*\\s*>(?<text>(.*?))</div>";
            var _maths_5 = Regex.Matches(html, _movie_des);
            var _content = string.Empty;
            var _movie_type = string.Empty;
            for (int ii = 0; ii < _maths_5.Count; ii++)
            {
                var c = _maths_5[ii].Groups["class"].Value;
                if (c.Equals(fildname))
                {
                    _content = _maths_5[ii].Groups["text"].Value;
                    break;
                }

            }
            #endregion
            return _content;
        }
        static string GetDivText(string html, string fildname)
        {
            #region 获取内容
            //<div class=\"m\" id=\"sortlist\">(<div[^>]*>(<div[^>]*>(<div[^>]*>.*?</div>|.)*?</div>|.)*?</div>|.)*?</div>
            var _movie_des = "<div[^>]*>[^<>]*(((?'Open'<div[^>]*>)[^<>]*)+((?'-Open'</div>)[^<>]*)+)*(?(Open)(?!))</div>";
            var _maths_5 = Regex.Matches(html, _movie_des);
            var _content = string.Empty;
            var _movie_type = string.Empty;
            for (int ii = 0; ii < _maths_5.Count; ii++)
            {
                var c = _maths_5[ii].Groups["class"].Value;
                if (c.Equals(fildname))
                {
                    _content = _maths_5[ii].Groups["text"].Value;
                    break;
                }

            }
            #endregion
            return _content;
        }
        static string GetXml(string html, string fildname) {
            int imgNum = 0;//图片编号
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            string imgPath = "//class";//选择img
            string fileName = ".jpg";
            //HtmlNode nodes = hd.DocumentNode.SelectSingleNode(path);
            //获取img标签中的图片
            foreach (HtmlNode node in doc.DocumentNode.SelectNodes(imgPath))
            {
                if (node.Attributes[fildname] != null)
                {
                    string imgUrl = node.Attributes[fildname].Value.ToString();
                    if (imgUrl != "" && imgUrl != " ")
                    {
                        imgNum++;
                        var file = DateTime.Now.ToString("yyyyMMddhhssmm") + imgNum + fileName;
                        
                       
                    }
                }
            }
            return "";
        }
        /// <summary>  
        /// 获取字符中指定标签的值  
        /// </summary>  
        /// <param name="str">字符串</param>  
        /// <param name="tagName">标签</param>  
        /// <param name="attrib">属性名</param>  
        /// <returns>属性</returns>  
        public static List<string> GetTagAttr(string str, string tagName, string attrib)
        {
            string tmpStr = string.Format("<{0}[^>]*?{1}=(['\"\"]?)(?<url>[^'\"\"\\s>]+)\\1[^>]*>", tagName, attrib);
            //获取<Script>属性值  

            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);

            List<string> list = new List<string>();
            foreach (Match m in titleMatch)
            {
                string result = m.Groups["url"].Value;
                if (string.IsNullOrEmpty(result) || list.Contains(result)) continue;

                list.Add(result);
            }

            return list;
        }
        /// <summary>  
        /// 获取字符中指定标签的值  
        /// </summary>  
        /// <param name="str">字符串</param>  
        /// <param name="tagName">标签</param>  
        /// <param name="attrib">属性名</param>  
        /// <returns>属性</returns>  
        public static List<string> GetHtmlTagAttr(string str, string classname, string attrib)
        {
            string tmpStr = string.Format("<div[^>]*?class=\"{0}\"[^>]*?>(([^<]*(?(?!</div>)<))*)</div>", classname, attrib);
            //获取<Script>属性值  

            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);

            List<string> list = new List<string>();
            foreach (Match m in titleMatch)
            {
                string result = m.Groups["text"].Value;
                if (string.IsNullOrEmpty(result) || list.Contains(result)) continue;

                list.Add(result);
            }

            return list;
        }

        /// <summary>  
        /// 获取字符中指定标签的值  
        /// </summary>  
        /// <param name="str">字符串</param>  
        /// <param name="tagName">标签</param>  
        /// <returns>值</returns>  
        public static List<string> GetTagContent(string str, string tagName)
        {
            string tmpStr = string.Format("<{0}[^>]*?>(?<Text>[^<]*)</{1}>", tagName, tagName); //获取<Script>之间内容  

            MatchCollection titleMatch = Regex.Matches(str, tmpStr, RegexOptions.IgnoreCase);

            List<string> list = new List<string>();
            foreach (Match m in titleMatch)
            {
                string result = m.Groups["Text"].Value;
                if (string.IsNullOrEmpty(result) || list.Contains(result)) continue;

                list.Add(result);
            }
            return list;
        }
    }
    /// <summary>
    /// Helpers that load a page in headless Chrome and return its HTML after the page's JavaScript has run.
    /// </summary>
    public class FinalHtml
    {
        public static string GetMobileHtml(string url, int sectionNum) {
            //设置手机端浏览模式
            var cdSvc = ChromeDriverService.CreateDefaultService();
            cdSvc.HideCommandPromptWindow = true;
            ChromeMobileEmulationDeviceSettings CMEDS = new ChromeMobileEmulationDeviceSettings();
            CMEDS.Width = 320; //设置窗体显示宽高
            CMEDS.Height = 800;
            CMEDS.PixelRatio = 1.0;
            CMEDS.UserAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25";
            ChromeOptions options = new ChromeOptions();
            options.AddArguments("lang=zh_CN.UTF-8");
            //不显示浏览器,无头模式
            options.AddArgument("--headless");
            options.EnableMobileEmulation(CMEDS);
            // 禁用图片
            options.AddUserProfilePreference("profile.default_content_setting_values.images", 2);
            // GPU加速可能会导致Chrome出现黑屏及CPU占用率过高,所以禁用
            options.AddArgument("--disable-gpu");

            IWebDriver driver = new ChromeDriver(cdSvc, options);
            driver.Navigate().GoToUrl(url);
            string title = driver.Title;
            Console.WriteLine($"Title: {title}");
            //将页面滚动到底部
            Console.Write("页面滚动中,请稍后");

            for (int i = 1; i <= sectionNum; i++)
            {
                string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
                js.ExecuteScript(jsCode);
                Console.Write(".");
                Thread.Sleep(1000);
            }
            Console.WriteLine();

            string html = driver.PageSource;
            driver.Quit();


            return html;
        }
        /// <summary>
        /// 获得拉动滚动条后的页面
        /// </summary>
        /// <param name="url">网址</param>
        /// <param name="sectionNum">滚动几次</param>
        /// <returns>html字符串</returns>
        public static string GetFinalHtml(string url, int sectionNum)
        {
            //不启动chrome窗口
            ChromeOptions options = new ChromeOptions();
            options.AddArgument("headless");

            //关闭ChromeDriver控制台
            ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
            driverService.HideCommandPromptWindow = true;


            ChromeDriver driver = new ChromeDriver(driverService, options);

            driver.Navigate().GoToUrl(url);

            string title = driver.Title;
            Console.WriteLine($"Title: {title}");
            //将页面滚动到底部
            Console.Write("页面滚动中,请稍后");

            for (int i = 1; i <= sectionNum; i++)
            {
                string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
                js.ExecuteScript(jsCode);
                Console.Write(".");
                Thread.Sleep(1000);
            }
            Console.WriteLine();

            string html = driver.PageSource;
            driver.Quit();






            return html;
        }

        /// <summary>
        /// Get请求
        /// </summary>
        /// <param name="url"></param>
        /// <param name="sectionNum"></param>
        /// <returns></returns>
        public static async Task<string> GetFinalHtmlAsync(string url, int sectionNum)
        {
            Task<string> task = Task<string>.Run(() =>
            {
                //不启动chrome窗口
                ChromeOptions options = new ChromeOptions();
                options.AddArgument("headless");

                //关闭ChromeDriver控制台
                ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
                driverService.HideCommandPromptWindow = true;


                ChromeDriver driver = new ChromeDriver(driverService, options);

                driver.Navigate().GoToUrl(url);

                string title = driver.Title;
                Console.WriteLine($"Title: {title}");
                //将页面滚动到底部
                Console.Write("页面滚动中,请稍后");

                for (int i = 1; i <= sectionNum; i++)
                {
                    string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                    IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
                    js.ExecuteScript(jsCode);
                    Console.Write(".");
                    Thread.Sleep(1000);
                }
                Console.WriteLine();

                string html = driver.PageSource;
                driver.Quit();
                return html;
            });
            return await task;
        }


    }
}