• pachong2


    <!--
        XPath tutorial: https://blog.csdn.net/li6727975/article/details/46126079
        JSON parsing tutorial: https://blog.csdn.net/luxideyao/article/details/77802389
    -->
    <!--
        Crawler module for 51job job postings.
        Flow, as wired below:
          1. operator "category" requests page 1 of the search results and extracts total_pages.
          2. operator "pagination" requests every result page from 1 to total_pages and
             collects, per row, the detail page link (listUrl) and the publish time.
          3. the criteria request opens each listUrl and extracts the detail fields
             named in the result column list.
        Note: every literal ampersand in a URL is written as &amp; and every literal
        less-than sign in an attribute is written as &lt; so that the file is well-formed XML.
        An XML parser hands the decoded characters to the engine.
    -->
    <module name="招聘"  type="51job">
          <!-- Search input: set the value of "keyword" to whatever should be searched for. -->
          <select>
                <input name="keyword"   type="text" value="java" label="相关关键词,可以职位或公司名称等,和51job官网一样"/>
          </select>

          <webSite>https://www.51job.com/</webSite>
          <!-- Output column names; each one matches a field name produced by the requests below. -->
          <result>职位,地点,薪资,公司名称,地址,公司性质,规模,分类,招聘要求,发布时间,公司福利,职位信息,公司信息</result>
          <!-- The site has anti-crawler protection, so proxy IPs must be rotated;
               this only takes effect with a plan that includes proxy IP usage. -->
          <proxyInfo />

          <!-- Variable substitution rule used throughout this engine: ${variableName} -->
          <operator  name="category" desc="获取总页数">
                <request charset="gbk">
                    <!-- Page 1 of the search results for ${keyword}; used only to read the page count. -->
                    <url>http://search.51job.com/list/000000,000000,0000,00,9,99,${keyword},2,1.html?lang=c&amp;stype=1&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;lonlat=0%2C0&amp;radius=-1&amp;ord_field=0&amp;confirmdate=9&amp;fromType=&amp;dibiaoid=0&amp;address=&amp;line=&amp;specialarea=00&amp;from=&amp;welfare=</url>
                    <header>
                        Connection: keep-alive
                        Upgrade-Insecure-Requests: 1
                        User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
                        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                        Referer: https://search.51job.com
                        Accept-Encoding: gzip, deflate, br
                        Accept-Language: zh-CN,zh;q=0.9
                    </header>
                    <output>
                        <field name="total_pages" desc="总页数">
                            <parser>//*[@class="p_in"]/span[1]</parser>
                            <!-- NOTE(review): presumably invokes NumberUtil.getNumber on the parsed text
                                 to keep only the number; confirm against the engine. -->
                            <script>NumberUtil;getNumber;${total_pages}</script>
                        </field>
                    </output>
                </request>
          </operator>

          <operator  name="pagination" desc="分页, pagination为系统命名 ">
                <!-- Iterates pageNo from 1 up to the total_pages value extracted by the "category" operator. -->
                <page  for="1 &lt;= pageNo &lt;= ${total_pages}">
                    <request charset="gbk">
                        <url>http://search.51job.com/list/000000,000000,0000,00,9,99,${keyword},2,${pageNo}.html?lang=c&amp;stype=1&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;lonlat=0%2C0&amp;radius=-1&amp;ord_field=0&amp;confirmdate=9&amp;fromType=&amp;dibiaoid=0&amp;address=&amp;line=&amp;specialarea=00&amp;from=&amp;welfare=</url>
                        <header>
                            Connection: keep-alive
                            Upgrade-Insecure-Requests: 1
                            User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
                            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                            Referer: https://search.51job.com
                            Accept-Encoding: gzip, deflate, br
                            Accept-Language: zh-CN,zh;q=0.9
                        </header>
                        <output>
                            <!-- Row index i starts at 4 and is substituted into div[${i}] below.
                                 NOTE(review): presumably the first three div children of resultList
                                 are not job rows; confirm against the live page. -->
                            <table for="4 &lt;= i">
                                <!-- Link to the job detail page; consumed as ${listUrl} by the criteria request. -->
                                <field name="listUrl">
                                    <parser>//*[@id="resultList"]/div[${i}]/p/span/a/@href</parser>
                                </field>
                                <field name="发布时间">
                                    <parser>//*[@id="resultList"]/div[${i}]/span[4]</parser>
                                </field>
                            </table>
                        </output>
                    </request>
                </page>
                <criteria>
                    <request charset="gbk" desc="从列表进入爬取详情信息 ">
                        <url>${listUrl}</url>
                        <header>
                            Host: jobs.51job.com
                            Connection: keep-alive
                            Upgrade-Insecure-Requests: 1
                            User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36
                            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                            Referer: https://search.51job.com/list/000000,000000,0000,00,9,99,${keyword},2,${pageNo}.html?lang=c&amp;stype=&amp;postchannel=0000&amp;workyear=99&amp;cotype=99&amp;degreefrom=99&amp;jobterm=99&amp;companysize=99&amp;providesalary=99&amp;lonlat=0%2C0&amp;radius=-1&amp;ord_field=0&amp;confirmdate=9&amp;fromType=&amp;dibiaoid=0&amp;address=&amp;line=&amp;specialarea=00&amp;from=&amp;welfare=
                            Accept-Encoding: gzip, deflate, br
                            Accept-Language: zh-CN,zh;q=0.9
                        </header>
                        <output>
                            <field name="职位">
                                <parser>//*[@class="tHeader tHjob"]/div/div[1]/h1</parser>
                            </field>
                            <field name="地点">
                                <parser>/html/body/div[3]/div[2]/div[2]/div/div[1]/span</parser>
                            </field>
                            <field name="薪资">
                                <parser>/html/body/div[3]/div[2]/div[2]/div/div[1]/strong</parser>
                            </field>
                            <field name="公司名称">
                                <parser>/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a</parser>
                            </field>
                            <!-- Intermediate field: not listed in the result columns. Its text is split
                                 on "|" by the three script fields that follow. -->
                            <field name="value">
                                <parser>/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]</parser>
                            </field>
                            <field name="公司性质">
                                <script>"${value}".split("|")[0];</script>
                            </field>
                            <field name="规模">
                                <script>"${value}".split("|")[1];</script>
                            </field>
                            <field name="分类">
                                <script>"${value}".split("|")[2];</script>
                            </field>
                            <field name="招聘要求">
                                <parser>/html/body/div[3]/div[2]/div[3]/div[1]/div/div</parser>
                            </field>
                            <field name="公司福利">
                                <parser>/html/body/div[3]/div[2]/div[3]/div[1]/div/p</parser>
                            </field>
                            <field name="职位信息">
                                <parser>/html/body/div[3]/div[2]/div[3]/div[2]/div/p[1]</parser>
                            </field>
                            <field name="地址">
                                <parser>/html/body/div[3]/div[2]/div[3]/div[3]/div/p/text()</parser>
                            </field>
                            <field name="公司信息">
                                <parser>/html/body/div[3]/div[2]/div[3]/div[4]/div/text()[1]</parser>
                            </field>
                        </output>
                    </request>
                </criteria>

          </operator>
    </module>
  • 相关阅读:
    Oracle Spool详解
    转自云界漫步:同步容灾100公里的限制来自哪里
    中国十大基金公司排行榜
    DataGuard体系结构
    SWIFT国际资金清算系统
    linux -- 命令大全
    bootstrap -- css -- 图片
    bootstrap -- css -- 按钮
    bootstrap -- css -- 表单控件
    bootstrap -- css -- 表单
  • 原文地址:https://www.cnblogs.com/sky-ai/p/9839095.html
Copyright © 2020-2023  润新知