• Daily notes


    # Stop a running Kafka broker; look up the PID via jps first.
    SIGNAL=${SIGNAL:-TERM}

    PIDS=$(jps -lm | grep -i 'kafka.Kafka' | awk '{print $1}')
    if [ -z "$PIDS" ]; then
      echo "No kafka server to stop"
      exit 1
    else
      kill -s $SIGNAL $PIDS
    fi

    # Alternative lookup via ps, for environments where jps is unavailable.
    PIDS=$(ps ax | grep -i 'kafka.Kafka' | grep java | grep -v grep | awk '{print $1}')

    if [ -z "$PIDS" ]; then
      echo "No kafka server to stop"
      exit 1
    else
      kill -s TERM $PIDS
    fi

    1. Interview question collection

    2. Distributed algorithm implementations

    3. Spark MLlib



    package sqlparser;

    import java.io.*;

    public class func {
        /** Read the entire file into a String using UTF-8; returns null if the encoding is unsupported. */
        public static String readToString(String fileName) {
            String encoding = "UTF-8";
            File file = new File(fileName);
            Long filelength = file.length();
            byte[] filecontent = new byte[filelength.intValue()];
            try {
                FileInputStream in = new FileInputStream(file);
                in.read(filecontent);
                in.close();
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                return new String(filecontent, encoding);
            } catch (UnsupportedEncodingException e) {
                System.err.println("The OS does not support " + encoding);
                e.printStackTrace();
                return null;
            }
        }
    }





    秋天的颜色01 name+qq
    Paic880807











    import numpy as np
    import pandas as pd

    pd.set_option('display.max_columns', 10)
    pd.set_option('expand_frame_repr', False)

    def loadData():
        df_off = pd.read_csv(r'ccf_offline_stage1_train.csv')
        df_on = pd.read_csv(r'ccf_online_stage1_train.csv')
        df_test = pd.read_csv(r'ccf_offline_stage1_test_revised.csv')
        return df_off, df_on, df_test
        # return df_off[:10], df_on[:10], df_test[:10]

    # Sample of the offline training set:
    # User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date
    # 0 1439408 2632 NaN NaN 0.0 NaN 20160217.0
    # 1 1439408 4663 11002.0 150:20 1.0 20160528.0 NaN
    # 2 1439408 2632 8591.0 20:1 0.0 20160217.0 NaN
    # 3 1439408 2632 1078.0 20:1 0.0 20160319.0 NaN
    # 4 1439408 2632 8591.0 20:1 0.0 20160613.0 NaN
    # Sample of the online training set:
    # User_id Merchant_id Action Coupon_id Discount_rate Date_received Date
    # 0 13740231 18907 2 100017492 500:50 20160513.0 NaN
    # 1 13740231 34805 1 NaN NaN NaN 20160321.0
    # 2 14336199 18907 0 NaN NaN NaN 20160618.0
    # 3 14336199 18907 0 NaN NaN NaN 20160618.0
    # 4 14336199 18907 0 NaN NaN NaN 20160618.0
    # Sample of the test set:
    # User_id Merchant_id Coupon_id Discount_rate Distance Date_received
    # 0 4129537 450 9983 30:5 1.0 20160712
    # 1 6949378 1300 3429 30:5 NaN 20160706
    # 2 2166529 7113 6928 200:20 5.0 20160727
    # 3 2166529 7113 1808 100:10 5.0 20160727
    # 4 6172162 7605 6500 30:1 2.0 20160708
    # Label counts, offline ('User_id','Merchant_id','Coupon_id','Discount_rate','Date_received','Date'):
    # 0 977900
    # -1 701602
    # 1 75382
    # Name: label, dtype: int64
    # Label counts, online:
    # -1 10557469
    # 0 655898
    # 1 216459

    df_off, df_on, df_test = loadData()

    # label: 1 = coupon received and used, 0 = coupon received but not used, -1 = no coupon
    df_off['label'] = -1
    df_off.loc[df_off['Coupon_id'].notnull() & df_off['Date'].notnull(), 'label'] = 1
    df_off.loc[df_off['Coupon_id'].notnull() & df_off['Date'].isnull(), 'label'] = 0

    df_on['label'] = -1
    df_on.loc[df_on['Coupon_id'].notnull() & df_on['Date'].notnull(), 'label'] = 1
    df_on.loc[df_on['Coupon_id'].notnull() & df_on['Date'].isnull(), 'label'] = 0

    # Keep only the records where a coupon was received
    real_off = df_off[df_off.label.isin([0, 1])]
    real_on = df_on[df_on.label.isin([0, 1])]

    cols = ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Date_received', 'Date', 'label']
    real_all = pd.concat([real_off[cols], real_on[cols]])
    print(real_all.iloc[:, 0].size, real_off.iloc[:, 0].size, real_on.iloc[:, 0].size)

    # Count coupon usage by the weekday of the receive date
    real_all['tmp'] = pd.to_datetime(real_all['Date_received'].astype(int).apply(str))
    real_all['weekday'] = real_all['tmp'].dt.weekday_name
    print(real_all.groupby(['weekday', 'label']).count())
    # pd.pivot_table(real_all, values='label', index='weekday')




















    http://mooc.study.163.com/university/deeplearning_ai#/c



    __author__ = 'Administrator'

    import time
    import pandas as pd

    def runtime(func):
        """Decorator that prints how long a function call takes."""
        def wrapper(*args, **kwargs):
            t1 = time.time()
            result = func(*args, **kwargs)
            t2 = time.time()
            print("{0} took {1:.2f}s".format(func.__name__, t2 - t1))
            return result
        return wrapper


    def loadData():
        df_train = pd.read_csv("weibo_train_data.txt", header=None, sep=' ')
        df_train.columns = ["uid", "mid", "date", "forward", "comment", "like", "content"]
        df_test = pd.read_csv("weibo_predict_data.txt", header=None, sep=' ')
        df_test.columns = ["uid", "mid", "date", "content"]
        return df_train, df_test

    def dataProcess(data):
        # Per-user median and mean of the forward/comment/like counts
        df = data.groupby('uid').agg(['median', 'mean'])
        df.columns = ['forward_median', 'forward_mean', 'comment_median', 'comment_mean', 'like_median', 'like_mean']
        train_stat = df.apply(pd.Series.round)  # rounded stats (currently unused)
        uid_dict = {}
        for uid, row in df.iterrows():
            uid_dict[uid] = row
        return uid_dict


    def fill_with_fixed_data(f, c, l):
        # Predict the same fixed forward/comment/like counts for every post
        df_train, df_test = loadData()
        df1 = df_test[['uid', 'mid']].copy()
        df1['forward'] = f
        df1['comment'] = c
        df1['like'] = l
        result = []
        for _, row in df1.iterrows():
            result.append("{0} {1} {2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        filename = "weibo_predict_{}_{}_{}.txt".format(f, c, l)
        with open(filename, 'w') as fh:
            fh.writelines(result)
        return result


    def fill_with_stat_data(stat='median'):
        # Predict each user's historical median (or mean); 0 for users not seen in training
        df_train, df_test = loadData()
        uid_dict = dataProcess(df_train)
        df1 = df_test[['uid', 'mid']].copy()
        forward, comment, like = [], [], []
        print(uid_dict)
        for uid in df_test['uid']:
            if uid in uid_dict:
                forward.append(int(uid_dict[uid]["forward_" + stat]))
                comment.append(int(uid_dict[uid]["comment_" + stat]))
                like.append(int(uid_dict[uid]["like_" + stat]))
            else:
                forward.append(0)
                comment.append(0)
                like.append(0)
        df1['forward'] = forward
        df1['comment'] = comment
        df1['like'] = like
        result = []
        for _, row in df1.iterrows():
            result.append("{0} {1} {2},{3},{4}\n".format(row[0], row[1], row[2], row[3], row[4]))
        filename = "weibo_predict_{}.txt".format(stat)
        with open(filename, 'w') as fh:
            fh.writelines(result)
        return result

    fill_with_stat_data()










    from numpy import *
    import operator
    from functools import reduce

    def loadDataSet():
        postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                       ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                       ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                       ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                       ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                       ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
        return postingList, classVec


    def createVocabList(dataSet):
        # Union of all words across the documents
        vocabSet = set(reduce(operator.add, dataSet))
        return list(vocabSet)


    def setOfWords2Vec(vocabList, inputSet):
        # Set-of-words model: 1 if the word appears in the document, else 0
        returnVec = [0] * len(vocabList)
        for word in inputSet:
            if word in vocabList:
                returnVec[vocabList.index(word)] = 1
            else:
                print("the word: %s is not in my Vocabulary!" % word)
        return returnVec






















    from numpy import *
    from os import listdir

    class kNN(object):
        def __init__(self, **kwargs):
            pass

        def data2matrix(self):
            # Parse the dating data set: three numeric features per line plus a label
            fr = open(r'G:\zqh_work\ML\datasets\ml_ac\Ch02\datingTestSet2.txt')
            lines = fr.readlines()
            line_num = len(lines)
            mat = zeros((line_num, 3))
            labels = []
            for i in range(line_num):
                mat[i] = lines[i].strip().split(' ')[0:3]
                labels.append(lines[i].strip().split(' ')[-1])
            return mat, labels

        def norm_data(self, mat):
            # Scale every feature to [0, 1]
            max_vals = mat.max(0)
            min_vals = mat.min(0)
            diff = max_vals - min_vals
            rows = mat.shape[0]
            norm_mat = (mat - tile(min_vals, (rows, 1))) / tile(diff, (rows, 1))
            return norm_mat

        def classify(self, inX, norm_mat, labels, k):
            # Euclidean distance to every sample, then a majority vote among the k nearest
            rows = norm_mat.shape[0]
            diff = tile(inX, (rows, 1)) - norm_mat
            distances = ((diff ** 2).sum(axis=1)) ** 0.5
            sorted_idx = distances.argsort()
            votes = {}
            for i in range(k):
                label = labels[sorted_idx[i]]
                votes[label] = votes.get(label, 0) + 1
            return max(votes, key=votes.get)

        def test(self):
            mat, labels = self.data2matrix()
            norm_mat = self.norm_data(mat)
            print(norm_mat)
            return norm_mat


    if __name__ == '__main__':
        knn = kNN()
        knn.test()

    http://keras-cn.readthedocs.io/en/latest/

    http://wiki.jikexueyuan.com/project/tensorflow-zh/get_started/introduction.html

    https://segmentfault.com/a/1190000002766035

    https://www.jianshu.com/p/8bb456cb7c77 

    http://cache.baiducontent.com/c?m=9f65cb4a8c8507ed4fece763104d96275e03c1743ca083572c85c91f84642c1c0733fee37c6243198385212240f8543d8883560b200356b799c28f4ac9fecf6879877a74250b873105d36eb8ca36768373c100beb81897adf04584afa2929d07139344040a97f0fc4d01648b2cae033093b1993f025e60eda76734b81f2c74c33441c650f997256f77d1b189081b837d867610e7ef68f52913c548e2485b7702fd0ca6092131309758268f1e6e4585ea2dbb7d3306&p=c2769a479d9e0bb312bd9b7e0d1488&newp=8465c64ad49506e42abd9b7e0d1496231610db2151d7d4146b82c825d7331b001c3bbfb423251003d2c0776600af495ee8f5367630032ba3dda5c91d9fb4c57479de607f02&user=baidu&fm=sc&query=org%2Eapache%2Espark%2Esql%2Eexecution%2EBufferedRowIterator%2EhasNext&qid=853831ee00006451&p1=7

    org.apache.spark.sql.execution.BufferedRowIterator.hasNext

    Is spark.write a distributed write?

    Transaction control in Scala?

    YARN web UI configuration; which machine is the RM (ResourceManager) on?

    Why not use yarn-cluster? Because logs are harder to collect?

    How to view executor logs?

    Which situations does each of Spark's configuration files apply to?

    https://www.cnblogs.com/sorco/p/7070922.html

    http://hongjiang.info/scala/    write something

    Spark executor logs: $SPARK_HOME/work/$app_id/$executor_id/stdout

    A summary of the JVM parameter settings for each Spark role:

    (1) Driver JVM parameters:
    -Xmx, -Xms: in yarn-client mode these default to the SPARK_DRIVER_MEMORY value in the spark-env file, with -Xmx and -Xms set to the same size; in yarn-cluster mode they are taken from the spark.driver.extraJavaOptions JVM options in spark-defaults.conf.
    PermSize: in yarn-client mode the default comes from JAVA_OPTS="-XX:MaxPermSize=256m $OUR_JAVA_OPTS" in the spark-class file; in yarn-cluster mode it is taken from spark.driver.extraJavaOptions in spark-defaults.conf.
    GC options: in yarn-client mode the default comes from JAVA_OPTS in the spark-class file; in yarn-cluster mode they are taken from spark.driver.extraJavaOptions in spark-defaults.conf.
    All of the above can ultimately be overridden by the --driver-java-options argument of spark-submit.

    (2) Executor JVM parameters:
    -Xmx, -Xms: in yarn-client mode these default to the SPARK_EXECUTOR_MEMORY value in the spark-env file, with -Xmx and -Xms set to the same size; in yarn-cluster mode they are taken from spark.executor.extraJavaOptions in spark-defaults.conf.
    PermSize: in both modes it is taken from spark.executor.extraJavaOptions in spark-defaults.conf.
    GC options: in both modes they are taken from spark.executor.extraJavaOptions in spark-defaults.conf.

    (3) Number of executors and CPU cores per executor
    In yarn-client mode the number of executors is set by SPARK_EXECUTOR_INSTANCES in spark-env and the cores per instance by SPARK_EXECUTOR_CORES; in yarn-cluster mode the number of executors is set by the --num-executors argument of spark-submit (default 2) and the CPU cores per executor by --executor-cores (default 1).
    The runtime information of each executor can be viewed with the yarn logs command, similar to the following:

    14/08/13 18:12:59 INFO org.apache.spark.Logging$class.logInfo(Logging.scala:58): Setting up executor with commands: List($JAVA_HOME/bin/java, -server, -XX:OnOutOfMemoryError='kill %p', -Xms1024m -Xmx1024m , -XX:PermSize=256M -XX:MaxPermSize=256M -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintHeapAtGC -Xloggc:/tmp/spark_gc.log, -Djava.io.tmpdir=$PWD/tmp, -Dlog4j.configuration=log4j-spark-container.properties, org.apache.spark.executor.CoarseGrainedExecutorBackend, akka.tcp://spark@sparktest1:41606/user/CoarseGrainedScheduler, 1, sparktest2, 3, 1>, <LOG_DIR>/stdout, 2>, <LOG_DIR>/stderr)

        Here, akka.tcp://spark@sparktest1:41606/user/CoarseGrainedScheduler is the driver's scheduler endpoint that the executor registers with, the 1 that follows is the executor number, sparktest2 is the host the executor runs on, and the 3 after that is the number of CPU cores this executor uses.
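
    For reference, the executor-side values can also be set programmatically before the SparkContext is created; the driver-side values have to go through spark-submit or the config files as described above, because the driver JVM is already running by then. A minimal sketch with placeholder values (the keys mirror the spark-defaults.conf entries and spark-submit flags above):

    import org.apache.spark.{SparkConf, SparkContext}

    // Placeholder values; each key corresponds to a spark-defaults.conf entry / spark-submit flag.
    val conf = new SparkConf()
      .setAppName("jvm-options-demo")
      .set("spark.executor.memory", "2g")                                          // executor -Xmx/-Xms
      .set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m -verbose:gc")  // PermSize / GC options
      .set("spark.executor.instances", "2")                                        // --num-executors
      .set("spark.executor.cores", "1")                                            // --executor-cores
    val sc = new SparkContext(conf)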

    First add SPARK_HISTORY_OPTS in spark-env.sh;

    then run start-history-server.sh;

    the HistoryServer process will start and listen on port 18080.

    After that you can happily browse it at http://hostname:18080.



    Author: 俺是亮哥
    Link: https://www.jianshu.com/p/65a3476757a5
    Source: Jianshu (简书)
    The copyright belongs to the author. For commercial reuse, contact the author for authorization; for non-commercial reuse, credit the source.

    The crux of the problem: the closure cannot be serialized. In this example the closure's scope is the function parser plus an implicit parameter it depends on, formats. The trouble lies with that implicit parameter: its type is DefaultFormats, a class that provides no way to serialize or deserialize itself, so Spark cannot serialize formats and therefore cannot ship the task to remote executors.

    The implicit parameter formats is required by extract, whose parameter list is:

    org.json4s.ExtractableJsonAstNode#extract[A](implicit formats: Formats, mf: scala.reflect.Manifest[A]): A = ...

    Once the root cause is found, the fix is easy. We do not actually need to serialize formats at all; for our purposes it is stateless, so we only have to declare it as a globally static value to sidestep serialization. The change is simply to move the declaration implicit val formats = DefaultFormats from inside the method up to a field of the App object.
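
    A minimal sketch of that fix (the object name, JSON field and parser body here are illustrative, not the original code):

    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    object MyJob extends App {                     // hypothetical App object
      // Object-level field: re-initialized in each executor JVM instead of being
      // captured by the closure, so Spark never has to serialize a Formats instance.
      implicit val formats = DefaultFormats

      def parser(json: String): String =
        (parse(json) \ "name").extract[String]     // extract resolves the implicit formats
    }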

    {
        "color_scheme": "Packages/Color Scheme - Default/Monokai.tmTheme",
        "font_size": 13,
        "ignored_packages":
        [
            "Vintage"
        ],
        "preview_on_click": false,
        "word_wrap": "true"
    }

    Ambari deployment

    Scala: static methods, singleton objects, companion objects

    Spark Job Server

    Spark ETL

    Spark resource limits

    YARN resource queue limits; per-user limits

    Algorithms

    Mount HDFS so it can be accessed like a local directory

    F5

    keepalived

    sdg agent collecting redo logs

    ActiveMQ vs Kafka

    Tomcat backup

    finixs

    The concept of groups? Hadoop groups

    Hue permission control

    Spark Thrift Server

    spark-submit

    Does Ranger only control sdo? Can it not control the command line?

    sms metadata capture, paired with ctm

    ranger ............

    Scala closures

    Java inner classes

    /etc/security/limits.conf

    Sync command: scp -r /seabox/develop/  26.6.0.141:/seabox

    Predicate pushdown

    Organize the business knowledge that needs to be understood

    (select z.*,row_number() over(partition by z.deal_no order by z.biz_date desc) rn
    from bridge.summit_i_repo_general_info_ib z
    where z.deal_no not in (select distinct deal_no from bridge.summit_i_repo_general_info_ib where deal_status in ('3', '4') and biz_date <= '{DATE_YYYYMMDD}')
    )

    How do you match a line like "received <number> rows"?
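
    One way to pull the count out of such a line, as a sketch in Scala (the exact log wording is an assumption):

    // Hypothetical log line; the regex captures the digits between "received" and "rows".
    val line = "2018-01-01 10:00:00 INFO loader: received 1234 rows"
    val pattern = """received\s+(\d+)\s+rows""".r
    val count: Option[Int] = pattern.findFirstMatchIn(line).map(_.group(1).toInt)
    // count == Some(1234)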

    START_TIME=`date "+%Y-%m-%d %H:%M:%S"`   ????????

    awk -F: '{print "User account: "$1}'

    Sqoop parameters

    Export from Kettle to an XML file

     http://confluence.paic.com.cn:6060/pages/viewpage.action?pageId=2132765

     http://www.docin.com/p-1354952858.html

    Oracle connection options (a JDBC sketch follows the list):

    JDBC

    ODBC

    OCI

    JNDI
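
    As a sketch of the first option, a minimal JDBC connection to Oracle (host, port, SID and credentials are placeholders; the ojdbc driver jar must be on the classpath):

    import java.sql.DriverManager

    val url = "jdbc:oracle:thin:@dbhost:1521:ORCL"   // placeholder host/port/SID
    val conn = DriverManager.getConnection(url, "user", "password")
    try {
      val rs = conn.createStatement().executeQuery("SELECT 1 FROM dual")
      while (rs.next()) println(rs.getInt(1))
    } finally {
      conn.close()
    }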

     http://logging.apache.org/log4j/2.x/

    View the current processes: ps

    Can be used to find where a given application is running: ps -aux | grep hive

    An example of collecting log4j logs with Flume:

    http://blog.csdn.net/nsrainbow/article/details/36875123

    H75244

    Uy1caTod6Hgb

    If the delimiter, bucketing, and so on were not defined when a table was created, can they still be added after the table exists?

    // Needed outside spark-shell: implicits for toDF, and the when/lit column functions
    import spark.implicits._
    import org.apache.spark.sql.functions._

    val dfa = sc.parallelize(List(("1", "aa", "ab"), ("2", "bb", "bb"), ("4", "dd", "dd"))).toDF("key", "val1", "val2")
    val dfb = sc.parallelize(List(("1", "aa", "ab"), ("2", "bb", "cc"), ("3", "cc", "cc"))).toDF("key", "val1", "val2")
    val dfc = sc.parallelize(List(("key"), ("val1"))).toDF("pkey")   // names of the join columns

    // Outer join on explicitly named columns
    val rv1 = dfb.join(dfa, dfa("key") === dfb("key") and dfa("val1") === dfb("val1"), "outer").show()

    // Build the same join condition dynamically from the column names stored in dfc
    val tmp = dfc.select("pkey").collect().map(_(0).toString())
    val mid = new Array[org.apache.spark.sql.Column](tmp.length)
    for (i <- 0 until tmp.length) mid(i) = dfa(tmp(i)) === dfb(tmp(i))
    val rv2 = dfb.join(dfa, mid.reduce(_ and _), "outer")

    // For rows that only exist in dfa, fall back to dfa's values column by column
    val cols = dfb.columns
    val all_col = new Array[org.apache.spark.sql.Column](cols.length)
    for (i <- 0 until cols.length) all_col(i) = when(dfb("key").isNull, dfa(cols(i))).otherwise(dfb(cols(i))).as(cols(i))

    val rv3 = rv2.select(all_col: _*).show()


    rv2.select(when(dfb("key").isNull, dfa("key")).otherwise(dfb("key")).as("key"))

    import scala.collection.mutable.ArrayBuffer

    // Split dfb's columns into numeric and string columns by dtype
    val cols = dfb.columns
    val a = dfb.dtypes
    val b = new ArrayBuffer[String]()
    for (i <- a if i._2 == "IntegerType") b += i._1

    // Numeric columns: default to 0 when the row only exists in dfa
    val numArray = b.toArray
    val num_col = new Array[org.apache.spark.sql.Column](numArray.length)
    for (i <- 0 until numArray.length) num_col(i) = when(dfb("key").isNull, lit(0)).otherwise(dfb(numArray(i))).as(numArray(i))

    // String columns: fall back to dfa's value when the row only exists in dfa
    val strArray = cols.filterNot(numArray.contains(_))
    val str_col = new Array[org.apache.spark.sql.Column](strArray.length)
    for (i <- 0 until strArray.length) str_col(i) = when(dfb("key").isNull, dfa(strArray(i))).otherwise(dfb(strArray(i))).as(strArray(i))

    val rv3 = rv2.select((num_col ++ str_col): _*)
    rv3.show()

     
