# OpenIOSSG / gitAnalysis

 
			
							#!/usr/bin/python
# -*- coding: utf-8 -*-

from pydriller import RepositoryMining
from pydriller import GitRepository
from pydriller.metrics.process.change_set import ChangeSet
from pydriller.metrics.process.code_churn import CodeChurn
from pydriller.metrics.process.commits_count import CommitsCount
from pydriller.metrics.process.contributors_count import ContributorsCount
from pydriller.metrics.process.contributors_experience import ContributorsExperience
from pydriller.metrics.process.hunks_count import HunksCount
from pydriller.metrics.process.lines_count import LinesCount
from datetime import datetime
import psycopg2
import time
import calendar
from collections import defaultdict

# from matplotlib import pyplot as plt
# import pandas as pd  # used to build data in the format the plot requires
# import statsmodels.api as sm  # used for locally weighted regression (LOWESS)


def plot(x, y):
    """Draw a murder-vs-burglary scatter plot with a LOWESS curve and save it.

    The figure is written to
    ``matplotlibScatter/studentanswer/level_2/crime.png`` and then shown.

    Args:
        x: Not used by the body.
        y: Not used by the body.

    NOTE(review): the body relies on the names ``plt``, ``sm`` and ``crime``,
    none of which is defined in the visible part of this file (the matplotlib
    and statsmodels imports near the top are commented out). ``crime`` is
    presumably a pandas DataFrame with 'state', 'murder' and 'burglary'
    columns -- confirm before calling this function.
    """
    # ********* Begin *********#
    fig, ax = plt.subplots()  # subplots() returns the figure and its axes
    # Leave out the 'District of Columbia' and 'United States' rows.
    crime2 = crime[~crime['state'].isin(['District of Columbia', 'United States'])]
    ax.set_xlim(0, 10)  # x axis runs from 0 to 10
    ax.set_ylim(0, 1200)  # y axis runs from 0 to 1200
    ax.set_xlabel("crime murder", fontsize=12)  # x axis label
    ax.set_ylabel("crime burglary", fontsize=12)  # y axis label
    # A figure title ("US murder and burglary rates", SimHei font, size 16)
    # was left disabled in the original code.
    lowess = sm.nonparametric.lowess(crime2["burglary"], crime2["murder"])

    ax.plot(lowess[:, 0], lowess[:, 1])  # fitted curve: column 0 on x, column 1 on y
    ax.plot(crime2["murder"], crime2["burglary"], "*", color="#00CC88")  # scatter points

    # NOTE(review): this is assigned after the figure already exists, so it
    # most likely only influences figures created later -- confirm intent.
    plt.rcParams['figure.figsize'] = (8.0, 4.0)
    # ********* End *********#
    # Save BEFORE showing: after a blocking show() the figure is gone and a
    # later savefig() would write an empty image.
    plt.savefig('matplotlibScatter/studentanswer/level_2/crime.png')  # save as PNG
    plt.show()  # display the figure
    plt.close()  # close the figure window
 

def others_fun():
    """Run the PyDriller process metrics over the configured repository.

    Reads the module-level names ``path_to_repo``, ``dt1`` and ``dt2`` (they
    are assigned in the ``__main__`` section of this file) and evaluates, in
    order: ChangeSet, CommitsCount, ContributorsCount, ContributorsExperience,
    HunksCount and CodeChurn for the ``dt1``..``dt2`` window.

    Every result is only bound to a local variable; nothing is printed and
    the function returns ``None``.
    """
    repo = path_to_repo
    window = {'since': dt1, 'to': dt2}

    # Change set: maximum and average number of files committed together.
    change_set = ChangeSet(repo, **window)
    together_max = change_set.max()
    together_avg = change_set.avg()

    # Commits count: number of commits made to each modified file.
    commits_per_file = CommitsCount(repo, **window).count()

    # Contributors count: developers per modified file, plus the "minor"
    # ones (described in the original notes as contributing less than 5%).
    contributors = ContributorsCount(repo, **window)
    contributors_per_file = contributors.count()
    minor_contributors_per_file = contributors.count_minor()

    # Contributors experience: share of lines authored by the top
    # contributor of each modified file.
    top_contributor_share = ContributorsExperience(repo, **window).count()

    # Hunks count: a hunk is a continuous block of changes in a diff, so this
    # indicates how fragmented the changes to a file are.
    hunks_per_file = HunksCount(repo, **window).count()

    # Code churn: sum of (added lines - removed lines) per file.
    churn = CodeChurn(repo, **window)
    churn_total = churn.count()
    churn_max = churn.max()
    churn_avg = churn.avg()

    # count() is queried a second time and its non-zero entries are tallied:
    # how many files have a non-zero total and what those totals add up to.
    churn_per_file = churn.count()
    file_num, line_num = add_values(churn_per_file)

def add_values(mapdata):
    """Count and sum the non-zero values of a mapping.

    Args:
        mapdata: Mapping whose values are numbers.

    Returns:
        A ``(count, total)`` tuple: how many values are not equal to zero and
        the sum of those values. An empty mapping gives ``(0, 0)``.
    """
    nonzero = [amount for amount in mapdata.values() if amount != 0]
    return len(nonzero), sum(nonzero)


# Connect to the PostgreSQL database "gitea" on 127.0.0.1:5432.
# This runs at module load time (it is not inside a function or the
# __main__ guard), so loading the file fails if the connection cannot be made.
# NOTE(review): the credentials are hard-coded here.
conn = psycopg2.connect(database="gitea", user="gitea", password="gitea", host="127.0.0.1", port="5432")
print("Opened database successfully")

# Single module-level cursor shared by create_tabel/insert/select/update/delete.
cur = conn.cursor()

def create_tabel():
    """Create the COMPANY table, commit, and close the shared connection.

    Uses the module-level ``cur`` and ``conn``. Because ``conn`` is closed at
    the end, the shared connection cannot be used after this call.
    """
    ddl = '''CREATE TABLE COMPANY
        (ID INT PRIMARY KEY     NOT NULL,
        NAME           TEXT    NOT NULL,
        AGE            INT     NOT NULL,
        ADDRESS        CHAR(50),
        SALARY         REAL);'''
    cur.execute(ddl)
    print("Table created successfully")

    # Persist the new table, then release the shared connection.
    conn.commit()
    conn.close()

def insert():
    """Insert four sample rows into COMPANY, commit, and close the connection.

    Uses the module-level ``cur`` and ``conn``. The statements run in order
    (IDs 1 to 4); ``conn`` is closed at the end.
    """
    statements = (
        "INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
          VALUES (1, 'Paul', 32, 'California', 20000.00 )",
        "INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (2, 'Allen', 25, 'Texas', 15000.00 )",
        "INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (3, 'Teddy', 23, 'Norway', 20000.00 )",
        "INSERT INTO COMPANY (ID,NAME,AGE,ADDRESS,SALARY) \
      VALUES (4, 'Mark', 25, 'Rich-Mond ', 65000.00 )",
    )
    for statement in statements:
        cur.execute(statement)

    conn.commit()
    print("Records created successfully")
    conn.close()

def select():
    """Print id, name, address and salary of every COMPANY row, then close.

    Uses the module-level ``cur`` and ``conn``; ``conn`` is closed at the end.
    """
    cur.execute("SELECT id, name, address, salary  from COMPANY")
    for record in cur.fetchall():
        # Columns come back in the order listed in the SELECT.
        company_id, company_name, address, salary = record
        print("ID = ", company_id)
        print("NAME = ", company_name)
        print("ADDRESS = ", address)
        print("SALARY = ", salary, "\n")

    print("Operation done successfully")
    conn.close()


def update():
    """Set the salary of COMPANY row ID=1 to 25000.00, commit, and list all rows.

    Uses the module-level ``cur`` and ``conn``. Prints the number of updated
    rows, then every row's id, name, address and salary, and finally closes
    ``conn``.
    """
    cur.execute("UPDATE COMPANY set SALARY = 25000.00 where ID=1")
    # commit() must actually be called: the bare attribute reference
    # `conn.commit` did nothing, so the UPDATE was lost when the connection
    # was closed at the end of this function.
    conn.commit()
    print("Total number of rows updated :", cur.rowcount)

    cur.execute("SELECT id, name, address, salary  from COMPANY")
    rows = cur.fetchall()
    for row in rows:
        print("ID = ", row[0])
        print("NAME = ", row[1])
        print("ADDRESS = ", row[2])
        print("SALARY = ", row[3], "\n")

    print("Operation done successfully")
    conn.close()

def delete():
    """Delete COMPANY row ID=2, commit, and list the remaining rows.

    Uses the module-level ``cur`` and ``conn``. Prints the number of deleted
    rows, then every remaining row's id, name, address and salary, and
    finally closes ``conn``.
    """
    cur.execute("DELETE from COMPANY where ID=2;")
    # commit() must actually be called: the bare attribute reference
    # `conn.commit` did nothing, so the DELETE was lost when the connection
    # was closed at the end of this function.
    conn.commit()
    print("Total number of rows deleted :", cur.rowcount)

    cur.execute("SELECT id, name, address, salary  from COMPANY")
    rows = cur.fetchall()
    for row in rows:
        print("ID = ", row[0])
        print("NAME = ", row[1])
        print("ADDRESS = ", row[2])
        print("SALARY = ", row[3], "\n")

    print("Operation done successfully")
    conn.close()


#def delete_tabel():


if __name__ == '__main__':
    # Script entry point. Steps, in order:
    #   1. build the analysis window [dt1, dt2] from the current local time;
    #   2. walk the non-merge commits of the repository in that window and
    #      tally commits / added / removed lines per author, per file and
    #      per branch;
    #   3. print the top files and authors plus a few named authors;
    #   4. query repository-wide facts (file count, total commits);
    #   5. compute LinesCount metrics and print a two-line summary.

    # --- 1. time window -----------------------------------------------------
    ticks = time.time()
    # print('local time ticks is {}'.format(ticks))
    localtime = time.localtime(time.time())
    # print('local time is {}'.format(localtime))


    # 24 hours before `ticks`, as epoch seconds and as a local struct_time.
    change_time = ticks - 60*60*24
    # print('after change time, ticks is {}'.format(change_time))
    change_local = time.localtime(change_time)
    # print('after change time local time is {}'.format(change_local) )


    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('now time: {}'.format(time_str))
    # Round trip of time_str back to epoch seconds; the result is not used
    # anywhere in the visible script.
    time_unix = time.mktime(time.strptime(time_str,"%Y-%m-%d %H:%M:%S"))
    # print('time_unix: {}'.format(time_unix))

    change_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(change_time))
    print('before time: {}'.format(change_str))


    # dt1 keeps only the date part of `change_local`, i.e. 00:00:00 of the
    # day that was 24 hours ago; dt2 is the current local time to the second.
    # Both are naive datetimes (no tzinfo is passed).
    #dt1 = datetime(change_local.tm_year, change_local.tm_mon, change_local.tm_mday, change_local.tm_hour, change_local.tm_min, change_local.tm_sec)
    dt1 = datetime(change_local.tm_year, change_local.tm_mon, change_local.tm_mday)
    dt2 = datetime(localtime.tm_year, localtime.tm_mon, localtime.tm_mday, localtime.tm_hour, localtime.tm_min, localtime.tm_sec)
    
    # Fixed window kept for manual experiments:
    #dt1 = datetime(2020, 7, 16)
    #dt2 = datetime(2020, 7, 24)    
    
    # Placeholders; none of these four names is read in the visible script.
    from_commit='from commit hash'
    to_commit='to commit hash'
    from_tag = ''
    to_tag = ''
    # Repository to analyse (alternative local paths / remote URLs kept below).
    # path_to_repo = '/home/pcl/guomy/code/profile/repos/opendata'
    # path_to_repo = "https://github.com/golang/vscode-go.git"
    # path_to_repo = 'https://github.com/PaddlePaddle/Paddle.git'
    path_to_repo = '/home/pcl/guomy/code/Paddle'


    # --- 2. per-commit analysis ----------------------------------------------
    # /*** analysis per commit***/
    # Commits between dt1 and dt2 with only_no_merge=True; the commented
    # keyword arguments are optional filters that are currently disabled.
    rm = RepositoryMining(path_to_repo, 
                        since=dt1, 
                        to=dt2,
                        # only_commits = ['dfb3ae1b9b44f351526d0f5b04e2656e127ed9be']
                        # only_authors = ['chenwhql']
                        # only_modifications_with_file_types=['.go', '.html', '.js', '.json', '.sample', '.py', '.ini', '.tmp1']
                        # only_in_branch='master',
                        only_no_merge=True 
                        ).traverse_commits()
    # Per-author tallies: commit count, modified (added + removed), added
    # and removed lines.
    commit_author = {}
    commit_author_mod = {}
    commit_author_add = {}
    commit_author_del = {}    
    # Commits whose branch set contains 'master' / does not contain it.
    commit_master = 0
    commit_other = 0

    # Branch name -> number of traversed commits that list that branch.
    commit_branchs = {}

    # Per-file tallies, keyed by m.filename.
    # NOTE(review): if m.filename is a base name rather than a full path,
    # equally named files in different directories share one entry -- confirm.
    commit_file = {}
    commit_file_mod = {}
    commit_file_add = {}
    commit_file_del = {}    
    
    # author -> file -> tally. The inner defaultdict is created without a
    # default factory, so a missing inner key still raises KeyError; the loop
    # below checks membership before incrementing.
    # These four are filled in the loop but not read again in the visible script.
    commit_author_file = defaultdict(defaultdict)
    commit_author_file_mod = defaultdict(defaultdict)
    commit_author_file_add = defaultdict(defaultdict)
    commit_author_file_del = defaultdict(defaultdict)

    # file -> author -> tally; created here but never filled or read in the
    # visible script.
    commit_file_author = defaultdict(defaultdict)
    commit_file_author_mod = defaultdict(defaultdict)
    commit_file_author_add = defaultdict(defaultdict)
    commit_file_author_del = defaultdict(defaultdict)

    # Authors that get an individual report line further down.
    authors = ['zhiqiu','Chen Weihang','Aurelius84']

    # Commit Object
    for commit in rm:
        name = commit.author.name
        # First commit of an author also initialises that author's line tallies.
        if name in commit_author.keys():
            commit_author[name] += 1 
        else:
            commit_author[name] = 1
            commit_author_add[name] = 0
            commit_author_del[name] = 0
            commit_author_mod[name] = 0


        # A commit is counted once for every branch it is reported on.
        for br in commit.branches:
            if br in commit_branchs.keys():
                commit_branchs[br] += 1
            else:
                commit_branchs[br] = 1


        if 'master' in commit.branches:
        #if 'develop' in commit.branches:
            commit_master += 1
        else:
            commit_other += 1       


        #print("****************************begin****************************")
        
        # One summary line per commit; the commented entries are further
        # commit attributes that can be enabled for debugging.
        print(
            #'project_name: {}'.format(commit.project_name),
            'author_date: {}'.format(commit.author_date),
            '|Hash: {}'.format(commit.hash),
            '|author: {}'.format(commit.author.name),
            #'|comment: {}'.format(commit.msg),
            #'|{}'.format(commit.committer.name),  
            #'author_timezone: {}'.format(commit.author_timezone), 
            #'committer: {}'.format(commit.committer), 
            #'committer_date: {}'.format(commit.committer_date), 
            #'committer_timezone: {}'.format(commit.committer_timezone), 
            'branches: {}'.format(commit.branches), 
            'in_main_branch: {}'.format(commit.in_main_branch), 
            'merge: {}'.format(commit.merge), 
            #'parents: {}'.format(commit.parents), 
            #'project_path:{} '.format(commit.project_path)
        )
        print("-------------------------------------------------------------")
        

        # for each modifications per file
        for m in commit.modifications:
            '''
            if (m.filename).endswith(('.jpg', '.png')):
                continue
            '''
            # Author totals across all files touched by this commit.
            commit_author_add[name] += m.added
            commit_author_del[name] += m.removed
            commit_author_mod[name] += (m.added + m.removed)

            # Author x file tallies.
            if m.filename in commit_author_file[name].keys():
                commit_author_file[name][m.filename] += 1
                commit_author_file_mod[name][m.filename] += (m.added + m.removed)
                commit_author_file_add[name][m.filename] += m.added
                commit_author_file_del[name][m.filename] += m.removed

            else:
                commit_author_file[name][m.filename] = 1
                commit_author_file_mod[name][m.filename] = (m.added + m.removed)
                commit_author_file_add[name][m.filename] = m.added
                commit_author_file_del[name][m.filename] = m.removed


            # Per-file tallies across all authors.
            if m.filename in commit_file.keys():
                commit_file[m.filename] += 1
                commit_file_add[m.filename] += m.added
                commit_file_del[m.filename] += m.removed
                commit_file_mod[m.filename] += (m.added + m.removed)
            else:
                commit_file[m.filename] = 1
                commit_file_add[m.filename] = m.added
                commit_file_del[m.filename] = m.removed
                commit_file_mod[m.filename] = (m.added + m.removed)

            '''
            print(
                "Author {}".format(commit.author.name),
                " modified {}".format(m.filename),
                #" changed_methods are {}".format(m.changed_methods),
                " with a change type of {}".format(m.change_type.name),
                " added {} removed {}".format(m.added, m.removed),
                #" and the complexity is {}".format(m.complexity)                
            )
            '''
            

    # --- 3. rankings ---------------------------------------------------------
    # (key, value) lists sorted by value, largest first. Only the two
    # commit-count orderings are used below; the *_add_order / *_del_order
    # lists are not read in the visible script.
    commit_author_order = sorted(commit_author.items(),key=lambda x:x[1],reverse=True)
    commit_author_add_order = sorted(commit_author_add.items(),key=lambda x:x[1],reverse=True)
    commit_author_del_order = sorted(commit_author_del.items(),key=lambda x:x[1],reverse=True)

    commit_file_order = sorted(commit_file.items(),key=lambda x:x[1],reverse=True)
    commit_file_add_order = sorted(commit_file_add.items(),key=lambda x:x[1],reverse=True)
    commit_file_del_order = sorted(commit_file_del.items(),key=lambda x:x[1],reverse=True)
        
    # Up to 5 files with the most commits.
    top = 5
    for f,v in commit_file_order:
        if top <= 0:
            break
        print('file: {},commit: {},modify: {},add: {},del: {} '.format(f, v, commit_file_mod[f], commit_file_add[f], commit_file_del[f]))
        top -=1

    # Up to 15 authors with the most commits.
    top = 15
    for f,v in commit_author_order:
        if top <= 0:
            break
        print('author: {},commit: {},modify: {},add: {},del: {} '.format(f, v, commit_author_mod[f], commit_author_add[f], commit_author_del[f]))
        top -=1


    # Report line for each author listed in `authors`, whether or not they
    # committed inside the window.
    # test
    # authors = ['zhiqiu','Chen Weihang','Aurelius84']
    for author in authors: 
        if author in commit_author.keys():
            print('author: {},commit: {},modify: {},add: {},del: {} '.format(author, commit_author[author], commit_author_mod[author], commit_author_add[author], commit_author_del[author]))
        else:
            print('author {} not commit among the time'.format(author))
     
    print("*****************************end*****************************")

    
    # --- 4. repository-wide facts --------------------------------------------
    # /*** common git***/
    gr = GitRepository(path_to_repo)
    # get total number of commits
    n_cm = gr.total_commits()

    # get the list of all commits
    #gr.get_list_commits()   
                   
    # get the specific commit
    #gr.get_commit('cc5b002') 

    # get the commit with tag v1.15
    #gr.get_commit_from_tag('v1.15')  
                        
    # get the list of files present in the repo at the current commit
    # (stored as dict keys, so repeated entries are counted once)
    file_map = {}
    for item in gr.files():
        file_map[item] = 1
   
    print('The num of files current in the repo is {}'.format(len(file_map)) )
    #print('The total number of commits is {}'.format(gr.total_commits()) ) 
    print('The total number of commits is {}'.format(n_cm) ) 

   
    # --- 5. LinesCount metrics and summary -----------------------------------
    #/*** Lines Count per file***/
    metric = LinesCount(path_to_repo,
                        since=dt1, 
                        to=dt2)
    
    # the total, maximum and average number of lines added for each modified file in the evolution period
    added_count = metric.count_added()
    added_max = metric.max_added()
    added_avg = metric.avg_added()
    #print('Total lines added per file: {}'.format(added_count))
    #print('Maximum lines added per file: {}'.format(added_max))
    #print('Average lines added per file: {}'.format(added_avg))
    # the total, maximum and average number of lines removed for each modified file in the evolution period
    removed_count = metric.count_removed()
    removed_max = metric.max_removed()
    removed_avg = metric.avg_removed()
    #print('Total lines removed per file: {}'.format(removed_count))
    #print('Maximum lines removed per file: {}'.format(removed_max))
    #print('Average lines removed per file: {}'.format(removed_avg))  
    

    # mod_file collects every file with a non-zero added or removed total;
    # mod_add / mod_removed are the sums of those totals.
    mod_file = {}
    mod_add = 0
    mod_removed = 0

    for key,value in added_count.items():
        if value == 0:

            continue
        else:
            mod_add += value
            mod_file[key] = 1

    for key,value in removed_count.items():
        if value == 0:
            continue
        else:
            mod_removed += value
            mod_file[key] = 1
    
    
    # Same totals derived from the per-commit traversal above, printed next
    # to the LinesCount figures in the last line.
    _,m_add = add_values(commit_author_add)
    _,m_del = add_values(commit_author_del)

    # "<n> commits to <branch>" for every branch except master, comma-joined.
    branch_info = ''
    for k,v in commit_branchs.items():
        if k == 'master':
            continue
        else:
            if branch_info != '':
                branch_info += ','
            branch_info += str(v) + ' commits to ' + k

    # NOTE(review): the value printed as "commits to all branches" is
    # commit_other, i.e. the commits NOT on master -- confirm whether the
    # total (commit_master + commit_other) was intended.
    print('Excluding merges, {} authors have pushed {} commits to master and {} commits to all branches, including {}.'.format(
        len(commit_author), 
        commit_master, 
        commit_other,
        branch_info))


    # Additions and deletions are each printed twice: the LinesCount figure
    # first, then the figure from the per-commit traversal.
    print('On all branchs including master, {} files have changed and there have been {},{} additions and {},{} deletions.'.format(
        len(mod_file), 
        mod_add, m_add, mod_removed, m_del))