C#制作网盘搜索工具(简单的爬虫)
阅读原文时间:2023年07月10日阅读:2

最近学习C#编程,在网上发现一篇winform下制作百度网盘搜索器的文章,故而下载源码学习一二。无奈原博所用的网址失效,故而自己改写了网址和相关源代码,也进行了实现。因为初学,接触的知识较多,为免忘记,进行整理复习。

1.知识点:

思路:主要是利用HttpWebRequest、HttpWebResponse进行HTTP模拟请求,然后利用HtmlAgilityPack+XPath语法从HTML DOM中获取元素,将截取到的相关内容在DataGridView中展示,最后利用Process.Start()方法在浏览器中打开所点击的链接。

2.具体实现:

2.1关于请求头的获取:

本例子使用网址为:http://www.pansoso.com/

分析上述网址的请求头进行模拟:

查看具体请求头信息:

根据获取的request url分析出其请求地址的规律为:所搜索的关键字(例如hello)通过GET方法直接拼接在url的最后;分页的规律为hello_1、hello_2……(每页十条记录)。

2.2关于结果的获取:

结果的获取,直接利用对response网页的分析截取关键信息即可。

3.代码实现:

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;

namespace 百度网盘资源搜索
{
class HttpHelper
{

    static readonly string urlTemplate = "http://www.pansoso.com/zh/{0}";
    /// <summary>
    /// Downloads the search-result page for <paramref name="key"/> from pansoso.com
    /// and returns the response body as text.
    /// </summary>
    /// <param name="key">
    /// Text substituted into <c>urlTemplate</c>. The form passes the search term
    /// followed by "_N" to select a result page.
    /// </param>
    /// <returns>
    /// The response body, or <c>null</c> when <paramref name="key"/> is null or the
    /// request fails for any reason.
    /// </returns>
    public static string Requset(string key)
    {
        if (key == null)
        {
            return null;
        }

        // Escape the key once so reserved characters ('#', '?', '/', spaces, ...) cannot
        // change the shape of the URL, and so the Referer header stays pure ASCII.
        string url = string.Format(urlTemplate, Uri.EscapeDataString(key));

        try
        {
            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(url);
            httpRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
            httpRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
            httpRequest.Host = "www.pansoso.com";
            httpRequest.Referer = url;

            // Dispose the response, its stream and the reader so the underlying
            // connection is released even when reading fails half-way.
            using (HttpWebResponse httpResponse = (HttpWebResponse)httpRequest.GetResponse())
            using (Stream responseStream = httpResponse.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort contract: callers treat null as "no page". The failure is
            // still traced so it is not lost entirely.
            System.Diagnostics.Debug.WriteLine("Request to " + url + " failed: " + ex.Message);
            return null;
        }
    }
    /// <summary>
    /// Deserializes a JSON document into a <see cref="SearchResult"/>.
    /// </summary>
    /// <param name="str">JSON text handed to <c>UtilityClass.GetObject</c>.</param>
    /// <returns>The deserialized search result.</returns>
    public static SearchResult dodata(string str)
    {
        return UtilityClass.GetObject<SearchResult>(str);
    }

        //if (doc.DocumentNode.SelectNodes("//comment()") != null)
        //{
        //    foreach (var commet in doc.DocumentNode.SelectNodes("//comment"))
        //    {
        //        commet.Remove();
        //    }
        //}

    /// <summary>
    /// Extracts the text that follows the first '(' in <paramref name="jsonString"/>,
    /// drops its last three characters and wraps the remainder as
    /// <c>{"resources": ... }</c>.
    /// </summary>
    /// <param name="jsonString">Raw response text; may be null.</param>
    /// <returns>
    /// The wrapped JSON text, or <c>null</c> when the input is null, contains no '('
    /// after its first character, or has fewer than three characters after the '('.
    /// </returns>
    public static string JsonPreProcessing(string jsonString)
    {
        // Number of trailing characters stripped from the payload.
        // NOTE(review): presumably the closing ")" plus terminator of a JSONP response - confirm.
        const int trailerLength = 3;

        if (jsonString == null)
        {
            return null;
        }

        // A '(' at index 0 is treated the same as "not found", as in the original check.
        int startIndex = jsonString.IndexOf('(');
        if (startIndex <= 0)
        {
            return null;
        }

        string payload = jsonString.Substring(startIndex + 1);
        if (payload.Length < trailerLength)
        {
            // Too short to carry the expected trailer; Remove() would throw.
            return null;
        }

        return "{\"resources\":" + payload.Remove(payload.Length - trailerLength) + "}";
    }
}
}

UtilityClass.cs

using System;
using System.Collections.Generic;
using System.IO;
//using System.Linq;
using System.Runtime.Serialization.Json;
using System.Text;

namespace 百度网盘资源搜索
{
    /// <summary>
    /// JSON helper built on <see cref="DataContractJsonSerializer"/>.
    /// </summary>
    class UtilityClass
    {
        /// <summary>
        /// Deserializes UTF-8 encoded JSON text into an instance of <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Target type passed to the serializer.</typeparam>
        /// <param name="json">JSON text to deserialize.</param>
        /// <returns>The object read by the serializer, cast to <typeparamref name="T"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="json"/> is null.</exception>
        public static T GetObject<T>(string json)
        {
            if (json == null)
            {
                throw new ArgumentNullException("json");
            }

            DataContractJsonSerializer serializer = new DataContractJsonSerializer(typeof(T));

            // The stream is only needed for the duration of the read.
            using (MemoryStream ms = new MemoryStream(Encoding.UTF8.GetBytes(json)))
            {
                return (T)serializer.ReadObject(ms);
            }
        }
    }
}

JSontoObject.cs

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace 百度网盘资源搜索
{
/// <summary>
/// Root object that <c>HttpHelper.dodata</c> deserializes JSON into.
/// </summary>
public class SearchResult
{
    /// <summary>
    /// The individual search hits. The member name matches the "resources" key that
    /// <c>HttpHelper.JsonPreProcessing</c> wraps the payload in.
    /// </summary>
    public BDWPResource[] resources { get; set; }
}

/// <summary>
/// One search hit inside a <see cref="SearchResult"/>.
/// </summary>
public class BDWPResource
{
    /// <summary>Hit title; <c>FrmMain.BindResource</c> strips &lt;b&gt; tags from it before display.</summary>
    public string title { get; set; }
    /// <summary>Hit description; <c>FrmMain.BindResource</c> strips &lt;b&gt; tags from it before display.</summary>
    public string content { get; set; }
    /// <summary>Link for the hit; shown as the third grid column by <c>FrmMain.BindResource</c>.</summary>
    public string unescapedUrl { get; set; }
}

}

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Threading;
using System.Diagnostics;

namespace 百度网盘资源搜索
{//主窗体
    public partial class FrmMain : Form
    {
        // Cleared by the Stop button on the UI thread and polled by the background search
        // thread between pages; volatile so the worker always observes the latest value.
        private volatile bool isSearch = true;

        // Site root that relative result links are appended to.
        private string url = "http://www.pansoso.com";

        /// <summary>
        /// Creates the main form and builds its designer-generated controls.
        /// </summary>
        public FrmMain()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a search for the text in <c>txtKey</c>: resets the result views, switches
        /// the buttons into "searching" state and fetches the result pages on a background thread.
        /// </summary>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            string keyword = this.txtKey.Text;
            if (string.IsNullOrEmpty(keyword))
            {
                return;
            }

            // Reset the UI for a fresh run.
            this.dataGridView1.Rows.Clear();
            this.lblResult.Text = "0";
            this.pgsBar.Value = 0;
            this.btnSearch.Text = "正在搜索";
            this.btnSearch.Enabled = false;
            this.btnStop.Enabled = true;

            Thread worker = new Thread(() =>
            {
                // Fetch result pages 1..10; stop early once the stop flag has been cleared.
                for (int page = 1; page < 11 && isSearch; page++)
                {
                    string html = HttpHelper.Requset(keyword + "_" + page);
                    gethtml(html);
                }

                // Search finished (or was stopped): restore the buttons.
                SearchOver();
            });
            worker.IsBackground = true;
            worker.Start();
        }

        /// <summary>
        /// Parses one downloaded result page and appends every hit to the grid and the
        /// rich text box.
        /// </summary>
        /// <param name="docs">
        /// HTML text of the page. Null or empty (which is what <c>HttpHelper.Requset</c>
        /// returns on failure) is ignored.
        /// </param>
        public void gethtml(string docs)
        {
            // Nothing was downloaded, so there is nothing to parse. Returning here avoids
            // reporting a failed request as "keyword has no resources".
            if (string.IsNullOrEmpty(docs))
            {
                return;
            }

            try
            {
                HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(docs);

                // Strip script elements first. Pages without any script element are
                // still processed below.
                HtmlAgilityPack.HtmlNodeCollection scripts = doc.DocumentNode.SelectNodes("//script");
                if (scripts != null)
                {
                    foreach (var script in scripts)
                    {
                        script.Remove();
                    }
                }

                // Title links carry both the title text and the relative href.
                HtmlAgilityPack.HtmlNodeCollection links = doc.DocumentNode.SelectNodes(".//h2/a[@href]");
                HtmlAgilityPack.HtmlNodeCollection descriptions = doc.DocumentNode.SelectNodes(".//div[@class='des']");
                if (links == null || descriptions == null)
                {
                    return;
                }

                // Pair titles with descriptions by position; never index past the shorter list.
                int count = Math.Min(links.Count, descriptions.Count);
                for (int i = 0; i < count; i++)
                {
                    string title = links[i].InnerText;
                    string content = descriptions[i].InnerText;
                    string url1 = url + links[i].Attributes["href"].Value;
                    string json = "title:" + title + "content:" + content + "unescapedUrl:" + "【" + url1 + "】";

                    SearchOver1(json);

                    // Controls may only be touched on the UI thread.
                    this.Invoke(new Action<string, string, string>((tle, ctt, url3) =>
                    {
                        this.dataGridView1.Rows.Add(tle, ctt, url3);
                        this.lblResult.Text = (Int32.Parse(this.lblResult.Text) + 1).ToString();
                        if (this.pgsBar.Value < this.pgsBar.Maximum)
                        {
                            this.pgsBar.Value++;
                        }
                    }), title, content, url1);
                }
            }
            catch (Exception)
            {
                MessageBox.Show("该关键字没有收录资源!!!");
            }
        }
            //if (doc.DocumentNode.SelectNodes("//style") != null)
            //{
            //    foreach (var style in doc.DocumentNode.SelectNodes("style"))
            //    {
            //        style.Remove();
            //    }
            //}

        /// <summary>
        /// Adds one deserialized resource to the grid on the UI thread, with the
        /// &lt;b&gt; highlight tags removed from its title and content.
        /// </summary>
        /// <param name="resource">The hit to display.</param>
        private void BindResource(BDWPResource resource)
        {
            string plainTitle = resource.title.Replace("</b>", "").Replace("<b>", "");
            string plainContent = resource.content.Replace("</b>", "").Replace("<b>", "");

            Action<string, string, string> addRow = (tle, ctt, link) =>
            {
                this.dataGridView1.Rows.Add(tle, ctt, link);

                // The label doubles as the running hit counter.
                int shown = Int32.Parse(this.lblResult.Text);
                this.lblResult.Text = (shown + 1).ToString();

                if (this.pgsBar.Value < this.pgsBar.Maximum)
                {
                    this.pgsBar.Value++;
                }
            };

            this.Invoke(addRow, plainTitle, plainContent, resource.unescapedUrl);
        }

        /// <summary>
        /// Puts the form back into its idle state on the UI thread: restores the Search
        /// button, disables Stop and re-arms the search flag.
        /// </summary>
        private void SearchOver()
        {
            Action restoreIdleState = delegate
            {
                this.btnSearch.Text = "开始搜索";
                this.btnSearch.Enabled = true;
                this.btnStop.Enabled = false;
                this.isSearch = true;
            };

            this.Invoke(restoreIdleState);
        }
        /// <summary>
        /// Appends <paramref name="str"/> followed by a line break to the rich text box,
        /// marshalled onto the UI thread.
        /// </summary>
        /// <param name="str">Text line to append.</param>
        public void SearchOver1(string str)
        {
            this.Invoke(new Action(() =>
            {
                // AppendText adds to the end instead of re-assigning the whole Text
                // property, which would rebuild the entire content for every row.
                this.richTextBox1.AppendText(str + System.Environment.NewLine);
            }));
        }
        /// <summary>
        /// Draws the 1-based row number for each grid row after the row has been painted.
        /// </summary>
        private void dataGridView1_RowPostPaint(object sender, DataGridViewRowPostPaintEventArgs e)
        {
            // This handler runs for every painted row, so the brush must be disposed
            // each time or GDI handles leak.
            using (SolidBrush b = new SolidBrush(this.dataGridView1.RowHeadersDefaultCellStyle.ForeColor))
            {
                e.Graphics.DrawString((e.RowIndex + 1).ToString(System.Globalization.CultureInfo.CurrentUICulture), this.dataGridView1.DefaultCellStyle.Font, b, e.RowBounds.Location.X + 20, e.RowBounds.Location.Y + 6);
            }

            // Paint over the icon shown in front of each row.
            e.Graphics.FillRectangle(Brushes.White, new Rectangle(new Point(e.RowBounds.Location.X + 2, e.RowBounds.Location.Y + 2), new Size(20, 20)));
        }

        /// <summary>
        /// Opens the link stored in the third column of the double-clicked row.
        /// </summary>
        private void dataGridView1_CellDoubleClick(object sender, DataGridViewCellEventArgs e)
        {
            // Header cells report a row index of -1.
            if (e.RowIndex < 0)
            {
                return;
            }

            // A row without a value in the link column (for example the empty
            // new-row placeholder) has nothing to open.
            object cellValue = this.dataGridView1.Rows[e.RowIndex].Cells[2].Value;
            string link = cellValue == null ? null : cellValue.ToString();
            if (string.IsNullOrEmpty(link))
            {
                return;
            }

            // Hands the link to the shell, which opens it in the default browser.
            Process.Start(link);
        }

        /// <summary>
        /// Asks the running search to stop after the page it is currently fetching.
        /// </summary>
        private void btnStop_Click(object sender, EventArgs e)
        {
            isSearch = false;

            // The Search button is deliberately NOT re-enabled here: the worker may still
            // be inside a request, and starting a second search now would run two workers
            // against the same stop flag. SearchOver() re-enables it once the worker ends.
            this.btnStop.Enabled = false;
        }

        /// <summary>
        /// Opens a link clicked in the rich text box, provided it is an absolute
        /// http or https URL.
        /// </summary>
        private void richTextBox1_LinkClicked(object sender, LinkClickedEventArgs e)
        {
            // The box shows text scraped from a web page, so the link is untrusted.
            // Only web URLs are handed to the shell; other schemes (file:, etc.) are ignored.
            Uri target;
            if (!Uri.TryCreate(e.LinkText, UriKind.Absolute, out target))
            {
                return;
            }

            if (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps)
            {
                return;
            }

            System.Diagnostics.Process.Start(e.LinkText);
        }
    }
}

4.效果实现: