• Flink之热门页面统计


    1、数据格式

    83.149.9.123 - - 17/05/2020:10:05:03 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-search.png
    83.149.9.123 - - 17/05/2020:10:05:43 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png
    83.149.9.123 - - 17/05/2020:10:05:47 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/highlight/highlight.js
    83.149.9.123 - - 17/05/2020:10:05:12 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/zoom-js/zoom.js
    83.149.9.123 - - 17/05/2020:10:05:07 +0000 GET /presentations/logstash-kafkamonitor-2020/plugin/notes/notes.js
    83.149.9.123 - - 17/05/2020:10:05:34 +0000 GET /presentations/logstash-kafkamonitor-2020/images/sad-medic.png
    83.149.9.123 - - 17/05/2020:10:05:57 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Bold.ttf
    83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/css/fonts/Roboto-Regular.ttf
    83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/frontend-response-codes.png
    83.149.9.123 - - 17/05/2020:10:05:50 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard.png
    83.149.9.123 - - 17/05/2020:10:05:46 +0000 GET /presentations/logstash-kafkamonitor-2020/images/Dreamhost_logo.svg
    83.149.9.123 - - 17/05/2020:10:05:11 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard2.png
    83.149.9.123 - - 17/05/2020:10:05:19 +0000 GET /presentations/logstash-kafkamonitor-2020/images/apache-icon.gif
    83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/nagios-sms5.png
    83.149.9.123 - - 17/05/2020:10:05:00 +0000 GET /presentations/logstash-kafkamonitor-2020/images/redis.png
    83.149.9.123 - - 17/05/2020:10:05:25 +0000 GET /presentations/logstash-kafkamonitor-2020/images/elasticsearch.png
    83.149.9.123 - - 17/05/2020:10:05:59 +0000 GET /presentations/logstash-kafkamonitor-2020/images/logstashbook.png
    83.149.9.123 - - 17/05/2020:10:05:30 +0000 GET /presentations/logstash-kafkamonitor-2020/images/github-contributions.png
    83.149.9.123 - - 17/05/2020:10:05:53 +0000 GET /presentations/logstash-kafkamonitor-2020/css/print/paper.css
    83.149.9.123 - - 17/05/2020:10:05:24 +0000 GET /presentations/logstash-kafkamonitor-2020/images/1983_delorean_dmc-12-pic-38289.jpeg
    83.149.9.123 - - 17/05/2020:10:05:54 +0000 GET /presentations/logstash-kafkamonitor-2020/images/simple-inputs-filters-outputs.jpg
    83.149.9.123 - - 17/05/2020:10:05:33 +0000 GET /presentations/logstash-kafkamonitor-2020/images/tiered-outputs-to-inputs.jpg
    83.149.9.123 - - 17/05/2020:10:05:56 +0000 GET /favicon.ico
    24.236.252.67 - - 17/05/2020:10:05:40 +0000 GET /favicon.ico
    93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /articles/dynamic-dns-with-dhcp/
    93.114.45.13 - - 17/05/2020:10:05:04 +0000 GET /reset.css
    93.114.45.13 - - 17/05/2020:10:05:45 +0000 GET /style2.css
    93.114.45.13 - - 17/05/2020:10:05:14 +0000 GET /favicon.ico
    93.114.45.13 - - 17/05/2020:10:05:17 +0000 GET /images/jordan-80.png
    93.114.45.13 - - 17/05/2020:10:05:21 +0000 GET /images/web/2009/banner.png
    66.249.73.135 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/ipv6
    50.16.19.13 - - 17/05/2020:10:05:10 +0000 GET /blog/tags/puppet?flav=rss20
    66.249.73.185 - - 17/05/2020:10:05:37 +0000 GET /
    110.136.166.128 - - 17/05/2020:10:05:35 +0000 GET /projects/xdotool/
    46.105.14.53 - - 17/05/2020:10:05:03 +0000 GET /blog/tags/puppet?flav=rss20
    110.136.166.128 - - 17/05/2020:10:05:06 +0000 GET /reset.css
    110.136.166.128 - - 17/05/2020:10:05:03 +0000 GET /style2.css
    110.136.166.128 - - 17/05/2020:10:05:41 +0000 GET /favicon.ico
    110.136.166.128 - - 17/05/2020:10:05:32 +0000 GET /images/jordan-80.png
    123.125.71.35 - - 17/05/2020:10:05:46 +0000 GET /blog/tags/release
    110.136.166.128 - - 17/05/2020:10:05:08 +0000 GET /images/web/2009/banner.png
    50.150.204.184 - - 17/05/2020:10:05:46 +0000 GET /images/googledotcom.png
    207.241.237.225 - - 17/05/2020:10:05:58 +0000 GET /blog/tags/examples
    200.49.190.101 - - 17/05/2020:10:05:36 +0000 GET /reset.css
    200.49.190.100 - - 17/05/2020:10:05:38 +0000 GET /blog/tags/web
    200.49.190.101 - - 17/05/2020:10:05:11 +0000 GET /style2.css
    200.49.190.101 - - 17/05/2020:10:05:37 +0000 GET /images/jordan-80.png
    66.249.73.185 - - 17/05/2020:10:05:00 +0000 GET /reset.css
    66.249.73.135 - - 17/05/2020:10:05:16 +0000 GET /blog/tags/munin
    66.249.73.135 - - 17/05/2020:10:05:33 +0000 GET /blog/tags/firefox?flav=rss20
    66.249.73.135 - - 17/05/2020:10:05:17 +0000 GET /blog/geekery/eventdb-ideas.html
    67.214.178.190 - - 17/05/2020:10:05:48 +0000 GET /
    67.214.178.190 - - 17/05/2020:10:05:18 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
    207.241.237.220 - - 17/05/2020:10:05:28 +0000 GET /blog/tags/projects
    46.105.14.53 - - 17/05/2020:10:05:44 +0000 GET /blog/tags/puppet?flav=rss20
    207.241.237.227 - - 17/05/2020:10:05:47 +0000 GET /blog/geekery/soekris-gpio.html
    91.177.205.119 - - 17/05/2020:10:05:22 +0000 GET /blog/geekery/xvfb-firefox.html
    91.177.205.119 - - 17/05/2020:10:05:34 +0000 GET /reset.css
    91.177.205.119 - - 17/05/2020:10:05:37 +0000 GET /style2.css
    91.177.205.119 - - 17/05/2020:10:05:54 +0000 GET /images/jordan-80.png
    91.177.205.119 - - 17/05/2020:10:05:31 +0000 GET /images/web/2009/banner.png
    91.177.205.119 - - 17/05/2020:10:05:32 +0000 GET /favicon.ico
    66.249.73.185 - - 17/05/2020:10:05:22 +0000 GET /doc/index.html?org/elasticsearch/action/search/SearchResponse.html
    207.241.237.228 - - 17/05/2020:10:05:40 +0000 GET /blog/tags/defcon
    207.241.237.101 - - 17/05/2020:10:05:51 +0000 GET /blog/tags/regex
    87.169.99.232 - - 17/05/2020:10:05:59 +0000 GET /presentations/puppet-at-loggly/puppet-at-loggly.pdf.html
    209.85.238.199 - - 17/05/2020:10:05:30 +0000 GET /blog/tags/firefox?flav=rss20
    209.85.238.199 - - 17/05/2020:10:05:15 +0000 GET /test.xml
    81.220.24.207 - - 17/05/2020:10:05:13 +0000 GET /blog/geekery/ssl-latency.html
    81.220.24.207 - - 17/05/2020:10:05:44 +0000 GET /reset.css
    81.220.24.207 - - 17/05/2020:10:05:26 +0000 GET /images/jordan-80.png
    81.220.24.207 - - 17/05/2020:10:05:39 +0000 GET /style2.css
    81.220.24.207 - - 17/05/2020:10:05:52 +0000 GET /images/web/2009/banner.png
    81.220.24.207 - - 17/05/2020:10:05:21 +0000 GET /favicon.ico
    66.249.73.135 - - 17/05/2020:11:05:17 +0000 GET /blog/geekery/vmware-cpu-performance.html
    46.105.14.53 - - 17/05/2020:11:05:42 +0000 GET /blog/tags/puppet?flav=rss20
    218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /robots.txt
    218.30.103.62 - - 17/05/2020:11:05:46 +0000 GET /robots.txt
    218.30.103.62 - - 17/05/2020:11:05:45 +0000 GET /projects/fex/
    74.125.40.20 - - 17/05/2020:11:05:59 +0000 GET /?flav=rss20
    71.212.224.97 - - 17/05/2020:11:05:05 +0000 GET /projects/xdotool/
    71.212.224.97 - - 17/05/2020:11:05:15 +0000 GET /reset.css
    71.212.224.97 - - 17/05/2020:11:05:22 +0000 GET /style2.css
    71.212.224.97 - - 17/05/2020:11:05:11 +0000 GET /images/jordan-80.png
    71.212.224.97 - - 17/05/2020:11:05:28 +0000 GET /images/web/2009/banner.png
    218.30.103.62 - - 17/05/2020:11:05:17 +0000 GET /projects/xdotool/xdotool.xhtml
    108.174.55.234 - - 17/05/2020:11:05:26 +0000 GET /?flav=rss20
    218.30.103.62 - - 17/05/2020:11:05:37 +0000 GET /blog/geekery/c-vs-python-bdb.html
    121.107.188.202 - - 17/05/2020:11:05:09 +0000 GET /presentations/logstash-kafkamonitor-2020/images/kibana-dashboard3.png
    218.30.103.62 - - 17/05/2020:11:05:39 +0000 GET /blog/productivity/better-zsh-xterm-title-fix.html
    218.30.103.62 - - 17/05/2020:11:05:11 +0000 GET /blog/geekery/xvfb-firefox.html
    218.30.103.62 - - 17/05/2020:11:05:00 +0000 GET /blog/geekery/puppet-facts-into-mcollective.html
    198.46.149.143 - - 17/05/2020:11:05:10 +0000 GET /blog/geekery/disabling-battery-in-ubuntu-vms.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
    198.46.149.143 - - 17/05/2020:11:05:48 +0000 GET /blog/geekery/solving-good-or-bad-problems.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
    218.30.103.62 - - 17/05/2020:11:05:28 +0000 GET /blog/geekery/jquery-interface-puffer.html%20target=
    218.30.103.62 - - 17/05/2020:11:05:05 +0000 GET /blog/geekery/ec2-reserved-vs-ondemand.html
    66.249.73.135 - - 17/05/2020:11:05:31 +0000 GET /blog/web/firefox-scrolling-fix.html
    86.1.76.62 - - 17/05/2020:11:05:36 +0000 GET /projects/xdotool/
    86.1.76.62 - - 17/05/2020:11:05:25 +0000 GET /reset.css
    86.1.76.62 - - 17/05/2020:11:05:19 +0000 GET /style2.css
    86.1.76.62 - - 17/05/2020:11:05:03 +0000 GET /favicon.ico
    86.1.76.62 - - 17/05/2020:11:05:28 +0000 GET /images/jordan-80.png
    86.1.76.62 - - 17/05/2020:11:05:07 +0000 GET /images/web/2009/banner.png
    66.249.73.135 - - 17/05/2020:11:05:58 +0000 GET /blog/tags/bdb
    107.170.41.69 - - 17/05/2020:11:05:31 +0000 GET /?flav=atom
    50.16.19.13 - - 17/05/2020:11:05:14 +0000 GET /blog/tags/puppet?flav=rss20
    46.105.14.53 - - 17/05/2020:11:05:02 +0000 GET /blog/tags/puppet?flav=rss20
    208.115.111.72 - - 17/05/2020:11:05:26 +0000 GET /blog/rants/fedora-yum.html
    208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /blog/tags/grok
    208.115.111.72 - - 17/05/2020:11:05:12 +0000 GET /blog/tags/is%20it%20done%20yet
    208.115.111.72 - - 17/05/2020:11:05:07 +0000 GET /blog/tags/statistics
    50.180.79.170 - - 17/05/2020:11:05:50 +0000 GET /favicon.ico
    208.115.111.72 - - 17/05/2020:11:05:05 +0000 GET /blog/tags/subversion
    208.115.111.72 - - 17/05/2020:11:05:52 +0000 GET /blog/web/194.html
    208.115.111.72 - - 17/05/2020:11:05:23 +0000 GET /files/blogposts/20070901/?C=D;O=A
    208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /files/blogposts/20080109/boost_xpressive_test.cpp
    208.115.111.72 - - 17/05/2020:11:05:38 +0000 GET /files/blogposts/20090520/
    208.115.111.72 - - 17/05/2020:11:05:41 +0000 GET /files/fastsplit/?C=M;O=D
    208.115.111.72 - - 17/05/2020:11:05:19 +0000 GET /files/xdotool/docs/man/?C=M;O=D
    208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /scripts/python/wrap/?C=N;O=D
    208.115.111.72 - - 17/05/2020:11:05:32 +0000 GET /files/images/?C=S;O=D
    208.115.111.72 - - 17/05/2020:11:05:00 +0000 GET /files/blogposts/20080611/
    208.115.111.72 - - 17/05/2020:11:05:16 +0000 GET /files/logstash/?C=D;O=D
    208.115.111.72 - - 17/05/2020:11:05:53 +0000 GET /presentations/hackday06/
    208.115.111.72 - - 17/05/2020:11:05:29 +0000 GET /scripts/grok-py-test/
    208.115.111.72 - - 17/05/2020:11:05:08 +0000 GET /?N=A&page=21
    208.115.111.72 - - 17/05/2020:11:05:49 +0000 GET /blog/geekery/oniguruma-named-capture-example.html?commentlimit=0
    208.115.111.72 - - 17/05/2020:11:05:01 +0000 GET /blog/geekery/ssh-key-invalid-hack.html?commentlimit=0
    208.115.111.72 - - 17/05/2020:11:05:31 +0000 GET /blog/geekery/server-side-javascript.html
    208.115.111.72 - - 17/05/2020:11:05:15 +0000 GET /blog/geekery/yahoo-hackday-08.html
    105.235.130.196 - - 17/05/2020:11:05:01 +0000 GET /images/googledotcom.png
    174.37.205.76 - - 17/05/2020:11:05:19 +0000 GET /blog
    54.255.13.204 - - 17/05/2020:11:05:03 +0000 GET /articles/ssh-security/
    105.235.130.196 - - 17/05/2020:11:05:45 +0000 GET /blog/tags/X11
    54.255.13.204 - - 17/05/2020:11:05:55 +0000 GET /reset.css
    54.255.13.204 - - 17/05/2020:11:05:32 +0000 GET /style2.css
    54.255.13.204 - - 17/05/2020:11:05:10 +0000 GET /favicon.ico
    105.235.130.196 - - 17/05/2020:11:05:20 +0000 GET /reset.css
    54.255.13.204 - - 17/05/2020:11:05:46 +0000 GET /images/jordan-80.png
    54.255.13.204 - - 17/05/2020:11:05:17 +0000 GET /images/web/2009/banner.png
    105.235.130.196 - - 17/05/2020:11:05:47 +0000 GET /style2.css
    105.235.130.196 - - 17/05/2020:11:05:37 +0000 GET /images/jordan-80.png
    105.235.130.196 - - 17/05/2020:11:05:22 +0000 GET /images/web/2009/banner.png
    134.76.249.10 - - 17/05/2020:11:05:01 +0000 GET /projects/xdotool/
    134.76.249.10 - - 17/05/2020:11:05:09 +0000 GET /reset.css
    134.76.249.10 - - 17/05/2020:11:05:57 +0000 GET /style2.css
    134.76.249.10 - - 17/05/2020:11:05:23 +0000 GET /favicon.ico
    134.76.249.10 - - 17/05/2020:11:05:40 +0000 GET /images/jordan-80.png
    134.76.249.10 - - 17/05/2020:11:05:50 +0000 GET /images/web/2009/banner.png
    134.76.249.10 - - 17/05/2020:11:05:47 +0000 GET /projects/xdotool
    134.76.249.10 - - 17/05/2020:11:05:13 +0000 GET /projects/xdotool/
    66.249.73.135 - - 17/05/2020:11:05:26 +0000 GET /?flav=atom
    207.241.237.220 - - 17/05/2020:11:05:24 +0000 GET /blog/tags/C?page=2
    68.184.202.186 - - 17/05/2020:11:05:28 +0000 GET /projects/xpathtool/
    68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /reset.css
    68.184.202.186 - - 17/05/2020:11:05:05 +0000 GET /images/jordan-80.png
    68.184.202.186 - - 17/05/2020:11:05:02 +0000 GET /style2.css
    68.184.202.186 - - 17/05/2020:11:05:37 +0000 GET /images/web/2009/banner.png
    68.184.202.186 - - 17/05/2020:11:05:58 +0000 GET /favicon.ico
    46.105.14.53 - - 17/05/2020:11:05:29 +0000 GET /blog/tags/puppet?flav=rss20
    66.249.73.135 - - 17/05/2020:11:05:00 +0000 GET /?flav=rss20
    24.233.162.179 - - 17/05/2020:11:05:31 +0000 GET /favicon.ico
    123.125.71.117 - - 17/05/2020:11:05:16 +0000 GET /
    220.181.108.153 - - 17/05/2020:11:05:09 +0000 GET /
    65.19.138.34 - - 17/05/2020:11:05:40 +0000 GET /
    66.249.73.135 - - 17/05/2020:11:05:32 +0000 GET /blog/geekery/rhapsody-on-linux.html
    97.116.185.190 - - 17/05/2020:11:05:59 +0000 GET /articles/dynamic-dns-with-dhcp/
    97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /reset.css
    97.116.185.190 - - 17/05/2020:11:05:29 +0000 GET /style2.css
    97.116.185.190 - - 17/05/2020:11:05:39 +0000 GET /images/jordan-80.png
    97.116.185.190 - - 17/05/2020:11:05:02 +0000 GET /images/web/2009/banner.png
    97.116.185.190 - - 17/05/2020:11:05:35 +0000 GET /favicon.ico
    5.255.72.168 - - 17/05/2020:11:05:21 +0000 GET /
    5.255.72.168 - - 17/05/2020:11:05:08 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
    46.105.14.53 - - 17/05/2020:11:05:33 +0000 GET /blog/tags/puppet?flav=rss20
    5.102.173.71 - - 17/05/2020:11:05:13 +0000 GET /robots.txt
    5.102.173.71 - - 17/05/2020:11:05:06 +0000 GET /projects/xdotool/
    208.91.156.11 - - 17/05/2020:11:05:05 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
    66.249.73.185 - - 17/05/2020:11:05:58 +0000 GET /presentations/logstash-1/
    74.125.176.81 - - 17/05/2020:11:05:28 +0000 GET /?flav=rss20
    66.249.73.135 - - 17/05/2020:11:05:14 +0000 GET /blog/geekery/xdotool-2.20110530.html
    187.45.193.158 - - 17/05/2020:11:05:54 +0000 GET /presentations/logstash-1/file/about-me/tequila-face.jpg
    90.220.199.149 - - 17/05/2020:11:05:18 +0000 GET /blog/geekery/puppet-manage-homedirectory-contents.html
    90.220.199.149 - - 17/05/2020:11:05:24 +0000 GET /reset.css
    90.220.199.149 - - 17/05/2020:11:05:50 +0000 GET /style2.css
    90.220.199.149 - - 17/05/2020:12:05:37 +0000 GET /images/jordan-80.png
    90.220.199.149 - - 17/05/2020:12:05:21 +0000 GET /images/web/2009/banner.png
    90.220.199.149 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico
    36.38.8.174 - - 17/05/2020:12:05:24 +0000 GET /blog/geekery/ssl-latency.html
    36.38.8.174 - - 17/05/2020:12:05:36 +0000 GET /reset.css
    36.38.8.174 - - 17/05/2020:12:05:14 +0000 GET /style2.css
    36.38.8.174 - - 17/05/2020:12:05:44 +0000 GET /images/jordan-80.png
    36.38.8.174 - - 17/05/2020:12:05:17 +0000 GET /images/web/2009/banner.png
    36.38.8.174 - - 17/05/2020:12:05:39 +0000 GET /favicon.ico
    71.207.12.53 - - 17/05/2020:12:05:17 +0000 GET /favicon.ico
    220.241.45.142 - - 17/05/2020:12:05:07 +0000 GET /robots.txt
    220.241.45.142 - - 17/05/2020:12:05:30 +0000 GET /projects/firefox-tabsearch/
    209.85.238.199 - - 17/05/2020:12:05:21 +0000 GET /?flav=atom
    46.105.14.53 - - 17/05/2020:12:05:53 +0000 GET /blog/tags/puppet?flav=rss20
    66.249.73.135 - - 17/05/2020:12:05:28 +0000 GET /blog/tags/noise
    View Code

    2、处理主类

    package service
    
    /**
     * @program: demo
     * @description: ${description}
     * @author: yang
     * @create: 2020-12-30 14:28
     */
    
    import java.sql.Timestamp
    
    import org.apache.flink.api.common.functions.AggregateFunction
    import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
    import org.apache.flink.streaming.api.TimeCharacteristic
    import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, KeyedProcessFunction}
    import org.apache.flink.streaming.api.scala._
    import org.apache.flink.streaming.api.scala.function.WindowFunction
    import org.apache.flink.streaming.api.watermark.Watermark
    import org.apache.flink.streaming.api.windowing.time.Time
    import org.apache.flink.streaming.api.windowing.windows.TimeWindow
    import org.apache.flink.util.Collector
    import utils.Utils
    
    import scala.collection.mutable.ListBuffer
    
    
    
    /**
     * Input record: one parsed line of the access log.
     *
     * @param ip        client IP address
     * @param userId    user identifier field of the log line
     * @param eventTime time of the request; used as the element's event-time timestamp
     * @param method    HTTP request method
     * @param url       requested URL
     */
    case class ApacheLogEvent(
      ip: String,
      userId: String,
      eventTime: Long,
      method: String,
      url: String
    )
    
    /**
     * Result of the per-URL window aggregation.
     *
     * @param url       requested URL
     * @param windowEnd end timestamp of the window this count belongs to
     * @param count     number of hits for the URL within that window
     */
    case class UrlViewCount(
      url: String,
      windowEnd: Long,
      count: Long
    )
    
    
    /**
     * Hot page statistics job.
     *
     * Reads the access log, counts hits per URL over a sliding event-time window and
     * prints, for every window, the five most requested URLs.
     */
    object HotPage {

      def main(args: Array[String]): Unit = {
        // Execution environment: event-time semantics, parallelism 1.
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Source: parse every log line into an ApacheLogEvent and attach
        // timestamps/watermarks (the extractor tolerates 10 s of out-of-orderness).
        val events = env
          .readTextFile(Utils.eventLogPath)
          .map(line => Utils.string2ApacheLogEvent(line))
          .assignTimestampsAndWatermarks(new HotPageEventTimeExtractor)

        // Hits per URL over a 5-minute window that slides every 5 seconds.
        val countsPerWindow = events
          .keyBy(event => event.url)
          .timeWindow(Time.minutes(5), Time.seconds(5))
          .aggregate(new PageCountAgg(), new PageWindowResult)

        // Regroup by window end so that all counts of one window are ranked together.
        countsPerWindow
          .keyBy(viewCount => viewCount.windowEnd)
          .process(new TopNHotPage(5))
          .print()

        env.execute("hot page count")
      }

    }
    
    /**
     * Ranks the URL counts belonging to one window and emits a formatted top-N report.
     *
     * The stream is keyed by window end, so all state and timers here are scoped to a
     * single window.
     *
     * @param topSize maximum number of URLs listed in each report
     */
    class TopNHotPage(topSize:Int) extends KeyedProcessFunction[Long,UrlViewCount,String]{
      // Keyed state holding URL -> hit count for the current key (window end).
      // TODO: a ListState would also work here.
      lazy val urlState:MapState[String,Long] =
      getRuntimeContext.getMapState(new MapStateDescriptor[String,Long](
        "url-state-count",classOf[String],classOf[Long]))

      /**
       * Buffers the incoming count and registers an event-time timer just after the
       * window end.
       *
       * @param value per-URL count of one window
       * @param ctx   context giving access to the timer service
       * @param out   unused here; output is produced in [[onTimer]]
       */
      override def processElement(value: UrlViewCount,
                                  ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context,
                                  out: Collector[String]): Unit = {
        // Store every incoming record; a later record for the same URL overwrites it.
        urlState.put(value.url,value.count)
        // Timer at windowEnd + 1; registering the same timestamp again for the same key
        // does not create a second timer.
        ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
      }

      /**
       * Sorts the buffered counts in descending order and emits the top `topSize` entries
       * as one multi-line string.
       *
       * @param timestamp firing time of the timer (window end + 1)
       * @param ctx       timer context (unused)
       * @param out       collector receiving the formatted report
       */
      override def onTimer(timestamp: Long,
                           ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext,
                           out: Collector[String]): Unit = {
        // Copy the state into a local buffer so it can be sorted.
        val allUrlViews:ListBuffer[(String,Long)] = new ListBuffer[(String, Long)]()

        val iter = urlState.entries().iterator()
        while(iter.hasNext){
          val entry = iter.next()
          allUrlViews += ((entry.getKey,entry.getValue))
        }
        // The window is finished: release its state.
        urlState.clear()
        // Descending by count, keep the top N.
        val sortedUrlView = allUrlViews.sortWith(_._2 > _._2).take(topSize)

        val result = new StringBuilder()
        // timestamp - 1 undoes the +1 applied when the timer was registered.
        result.append("时间:").append(new Timestamp( timestamp -1)).append("\n")
        sortedUrlView.foreach( view =>{
          result.append("URL:").append(view._1)
            .append(" 访问量:").append(view._2).append("\n")
        })
        result.append("===================")

        out.collect(result.toString())
      }
    }
    
    /**
     * Window function that turns the pre-aggregated hit count of one URL into a
     * [[UrlViewCount]] tagged with the end of its window.
     */
    class PageWindowResult() extends WindowFunction[Long,UrlViewCount,String,TimeWindow]{
      override def apply(key: String, window: TimeWindow,
                         input: Iterable[Long],
                         out: Collector[UrlViewCount]): Unit = {
        // The window end serves as the identifier of the window downstream.
        val windowEnd = window.getEnd
        // Only the first element of the input is read.
        val hits = input.iterator.next()
        out.collect(UrlViewCount(url = key, windowEnd = windowEnd, count = hits))
      }
    }
    
    /**
     * Incremental aggregation that counts the events of one URL.
     *
     * The accumulator is a running total; the result is that total.
     */
    class PageCountAgg() extends AggregateFunction[ApacheLogEvent,Long,Long]{
      /** Starts a new count at zero. */
      override def createAccumulator(): Long = {
        0L
      }

      /** Every event contributes exactly one hit, regardless of its content. */
      override def add(in: ApacheLogEvent, acc: Long): Long = {
        acc + 1L
      }

      /** Combines two partial counts by summing them. */
      override def merge(acc: Long, acc1: Long): Long = {
        acc + acc1
      }

      /** The accumulator itself is the final result. */
      override def getResult(acc: Long): Long = {
        acc
      }
    }
    
    
    /**
     * Periodic watermark assigner for [[ApacheLogEvent]].
     *
     * The watermark trails the largest event time seen so far by a fixed
     * out-of-orderness bound.
     */
    class HotPageEventTimeExtractor extends AssignerWithPeriodicWatermarks[ApacheLogEvent]{

      var currentMaxEventTime = 0L // largest event time seen so far
      val maxOufOfOrderness = 10000 // maximum out-of-orderness: 10 s

      /**
       * Computes the current watermark.
       *
       * @return the largest event time seen so far minus the out-of-orderness bound
       */
      override def getCurrentWatermark: Watermark = {
        val watermarkTime = currentMaxEventTime - maxOufOfOrderness
        new Watermark(watermarkTime)
      }

      /**
       * Selects the event-time field of an element and keeps the running maximum up to date.
       *
       * @param element                  the record to extract the timestamp from
       * @param previousElementTimestamp timestamp previously attached to the element (ignored)
       * @return the element's `eventTime`
       */
      override def extractTimestamp(element: ApacheLogEvent, previousElementTimestamp: Long): Long = {
        val eventTime = element.eventTime
        // Advance the maximum only when this element is newer than anything seen before.
        if (eventTime > currentMaxEventTime) {
          currentMaxEventTime = eventTime
        }
        eventTime
      }

    }

    3、Utils工具类

    package utils
    
    /**
     * @program: demo
     * @description: ${description}
     * @author: yang
     * @create: 2020-12-30 14:26
     */
    import java.text.SimpleDateFormat
    
    import service.{AdClickEvent, ApacheLogEvent, UserBehavior}
    
    
    /**
     * Input file locations and line parsers shared by the demo jobs.
     */
    object Utils {

      // Backslashes in the Windows paths below are escaped: in an ordinary string
      // literal a single backslash starts an escape sequence, so "\j" or "\d" would not
      // compile and "\r" / "\f" would turn into control characters.

      // Path of the access (event) log
      val eventLogPath = "E:\\java\\demo\\src\\main\\resources\\file\\data2.log"
      // Path of the ad-click log
      val adClickLogPath = "E:\\java\\demo\\src\\main\\resources\\file\\data3.csv"

      // Path of the user-behavior log
      val userBehaviorLogPath="E:\\java\\demo\\src\\main\\resources\\file\\data1.csv"


      /**
       * Parses one space-separated access-log line into an [[ApacheLogEvent]].
       *
       * Fields used: 0 = ip, 1 = userId, 3 = timestamp (`dd/MM/yyyy:HH:mm:ss`),
       * 5 = method, 6 = url.
       *
       * @param line raw log line
       * @return the parsed event, with the timestamp converted to epoch milliseconds
       */
      def string2ApacheLogEvent(line:String):ApacheLogEvent={
        val fields = line.split(" ")
        // A new formatter per call: SimpleDateFormat is not thread-safe.
        // NOTE(review): the offset in fields(4) (e.g. "+0000") is not parsed, so the time
        // is interpreted in the JVM default time zone.
        val dateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        val timeStamp = dateFormat.parse(fields(3).trim).getTime
        ApacheLogEvent(fields(0).trim,fields(1).trim,timeStamp,
          fields(5).trim,fields(6).trim)
      }

      /**
       * Parses one comma-separated line into an `AdClickEvent`.
       *
       * Columns 0, 1 and 4 are converted to `Long`; columns 2 and 3 are kept as trimmed
       * strings.
       *
       * @param line raw CSV line
       * @return the parsed ad-click event
       */
      def string2ClickEvent(line:String):AdClickEvent={
        val dataArray = line.split(",")
        AdClickEvent(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim, dataArray(3).trim, dataArray(4).trim.toLong)
      }

      /**
       * Parses one comma-separated line into a `UserBehavior`.
       *
       * Columns 0, 1, 2 and 4 are converted to `Long`; columns 3 and 5 are kept as trimmed
       * strings.
       *
       * @param line raw CSV line
       * @return the parsed user-behavior record
       */
      def string2UserBehavior(line:String):UserBehavior={
        val fields = line.split(",")
        UserBehavior(fields(0).trim.toLong,
          fields(1).trim.toLong,
          fields(2).trim.toLong,
          fields(3).trim,
          fields(4).trim.toLong,
          fields(5).trim
        )

      }

    }
  • 相关阅读:
    xgqfrms™, xgqfrms® : xgqfrms's offical website of GitHub!
    xgqfrms™, xgqfrms® : xgqfrms's offical website of GitHub!
    xgqfrms™, xgqfrms® : xgqfrms's offical website of GitHub!
    Scapy 工具介绍
    ubuntu虚拟机使用open-vm-tools代替vmware-tools
    docker、vmware和PD的区别
    ubuntu查看OpenGL版本
    SQL Server 常用近百条SQL语句(收藏版)
    Intellij IDEA 如何去掉 @Autowired 注入警告
    awtk-linux-fb 使用 double framebuffer 闪烁的问题
  • 原文地址:https://www.cnblogs.com/ywjfx/p/14234937.html
Copyright © 2020-2023  润新知