さらに、piRSquared、Borja、jezrael によってすでに投稿された回答について、小規模な性能テストを行いました。`numTasks` と `numRows` の設定ごとに実行時間を計測するためのコードは次のとおりです:
import timeit
import numpy as np
# Setup code handed to timeit. It is %-formatted with
# (number of unique tasks, number of rows) and builds the two frames the
# benchmarked statements operate on: `df` (columns Task, Emp; sorted by Task)
# and `df1` (one row per unique Task with a random permutation as Days).
# The letters come from string.ascii_letters, which exists on both Python 2
# and Python 3; the former string.letters is Python 2 only.
setup = """
import pandas as pd
import numpy as np
import string
# number of unique tasks
numTasks = %s
# number of rows in df
numRows = %s
## creating df
# columns for df
col1 = np.random.choice(range(numTasks), numRows)
col2 = np.random.choice(list(string.ascii_letters), numRows)
df = pd.DataFrame({'Task': col1,
                   'Emp': col2})
df = df.sort_values("Task").reset_index(drop=True)
# creating df1
tasks = df.Task.unique()
nTasks = len(tasks)
df1 = pd.DataFrame({'Task': tasks,
                    'Days': np.random.permutation(range(nTasks))})
"""
# Statements to benchmark, one per answer. Each is executed by timeit against
# the `df` / `df1` frames created by the setup code.
solutionPiRSquared = """
pd.concat([d.set_index('Task') for d in [df, df1]], axis=1).reset_index(drop=True)
"""
solutionBorja = """
pd.merge(df, df1, on='Task')
"""
solutionJezrael = """
df.rename(columns={'Task':'Days'}, inplace=True)
df['Days'] = df['Days'].map(df1.set_index('Task')['Days'])
df = df[['Emp','Days']]
"""

# How many times each statement is timed per setting.
numRepetitions = 100

# Result table: one record per answer holding the author ('by'), the
# statement to time ('code') and the timing statistics, which stay None
# until the benchmark loop fills them in.
solutions = [
    dict({'by': author, 'code': statement},
         **dict.fromkeys(('min', 'max', 'mean', 'std')))
    for author, statement in (('piRSquared', solutionPiRSquared),
                              ('borja', solutionBorja),
                              ('jezrael', solutionJezrael))
]
# Test several settings for the number of tasks and the number of rows.
# For each setting every solution is executed <numRepetitions> times and its
# execution time is measured; min, max, mean and standard deviation are
# calculated from those timings.
#
# Every print call takes a single pre-formatted string so the script emits the
# same text under both Python 2 and Python 3.
for (NUM_TASKS, NUM_ROWS) in [(10, 1000),
                              (100, 10000),
                              (1000, 10000),
                              (1000, 100000),
                              (1000, 1000000),
                              (10000, 1000000),
                              (100000, 1000000)]:
    print("-----------------------------------------")
    print("number of rows: {}".format(NUM_ROWS))
    print("number of tasks: {}".format(NUM_TASKS))
    print("")

    # The setup code is the same for every solution within one setting.
    setupCode = setup % (NUM_TASKS, NUM_ROWS)

    for solution in solutions:
        # number=1 with repeat=numRepetitions: timeit re-runs the setup code
        # before every single timed execution of the statement.
        result = np.array(timeit.repeat(solution["code"],
                                        setup=setupCode,
                                        number=1,
                                        repeat=numRepetitions))
        solution['min'] = result.min()
        solution['max'] = result.max()
        solution['mean'] = result.mean()
        solution['std'] = result.std()

    # Sort solutions by their mean execution time (fastest first).
    solutions.sort(key=lambda s: s['mean'])
    best = solutions[0]['mean']

    # Print the sorted results along with the increase in mean execution
    # time relative to the fastest solution for the current setting
    # (the number shown is a percentage).
    for idx, solution in enumerate(solutions):
        d = {'idx': idx + 1,
             'rel': "[rel to best: +{:.2f}]".format(100*(solution['mean']-best)/best) if idx > 0 else '[best]',
             'by': solution["by"],
             'min': solution["min"],
             'max': solution["max"],
             'mean': solution["mean"],
             'std': solution["std"]}
        print("{idx}. {rel}: solution by {by}".format(**d))
        print(" min: {min:.4f}, mean: {mean:.4f}, std: {std:.4f}, max: {max:.4f})".format(**d))
    print("-----------------------------------------")
いくつかの設定、すなわち `numTasks`(df 内のユニークなタスクの数)と `numRows`(df の行数)の組み合わせについてテストし、実行時間の統計を計算します。python2.7 を使い、私のマシン(Intel® Core™2 Duo CPU P8700 @ 2.53GHz × 2)で実行した結果は次のとおりです:
-----------------------------------------
number of rows: 1000
number of tasks: 10
1. [best]: solution by borja
min: 0.0020, mean: 0.0021, std: 0.0001, max: 0.0026)
2. [rel to best: +3.12]: solution by piRSquared
min: 0.0021, mean: 0.0022, std: 0.0002, max: 0.0030)
3. [rel to best: +14.46]: solution by jezrael
min: 0.0023, mean: 0.0024, std: 0.0002, max: 0.0032)
-----------------------------------------
-----------------------------------------
number of rows: 10000
number of tasks: 100
1. [best]: solution by piRSquared
min: 0.0026, mean: 0.0028, std: 0.0002, max: 0.0040)
2. [rel to best: +13.39]: solution by borja
min: 0.0028, mean: 0.0031, std: 0.0009, max: 0.0119)
3. [rel to best: +23.38]: solution by jezrael
min: 0.0033, mean: 0.0034, std: 0.0002, max: 0.0043)
-----------------------------------------
-----------------------------------------
number of rows: 10000
number of tasks: 1000
1. [best]: solution by piRSquared
min: 0.0027, mean: 0.0030, std: 0.0003, max: 0.0044)
2. [rel to best: +5.63]: solution by borja
min: 0.0030, mean: 0.0031, std: 0.0002, max: 0.0040)
3. [rel to best: +22.01]: solution by jezrael
min: 0.0034, mean: 0.0036, std: 0.0002, max: 0.0046)
-----------------------------------------
-----------------------------------------
number of rows: 100000
number of tasks: 1000
1. [best]: solution by piRSquared
min: 0.0092, mean: 0.0099, std: 0.0008, max: 0.0141)
2. [rel to best: +39.06]: solution by borja
min: 0.0130, mean: 0.0137, std: 0.0009, max: 0.0170)
3. [rel to best: +71.95]: solution by jezrael
min: 0.0163, mean: 0.0170, std: 0.0006, max: 0.0192)
-----------------------------------------
-----------------------------------------
number of rows: 1000000
number of tasks: 1000
1. [best]: solution by piRSquared
min: 0.0882, mean: 0.0915, std: 0.0025, max: 0.1013)
2. [rel to best: +50.27]: solution by borja
min: 0.1256, mean: 0.1375, std: 0.0104, max: 0.1828)
3. [rel to best: +75.97]: solution by jezrael
min: 0.1557, mean: 0.1610, std: 0.0047, max: 0.1862)
-----------------------------------------
-----------------------------------------
number of rows: 1000000
number of tasks: 10000
1. [best]: solution by piRSquared
min: 0.0887, mean: 0.0949, std: 0.0059, max: 0.1282)
2. [rel to best: +41.71]: solution by borja
min: 0.1247, mean: 0.1345, std: 0.0055, max: 0.1621)
3. [rel to best: +84.01]: solution by jezrael
min: 0.1668, mean: 0.1746, std: 0.0072, max: 0.2146)
-----------------------------------------
-----------------------------------------
number of rows: 1000000
number of tasks: 100000
1. [best]: solution by piRSquared
min: 0.0959, mean: 0.1006, std: 0.0036, max: 0.1177)
2. [rel to best: +51.91]: solution by borja
min: 0.1473, mean: 0.1528, std: 0.0047, max: 0.1800)
3. [rel to best: +77.68]: solution by jezrael
min: 0.1730, mean: 0.1787, std: 0.0059, max: 0.2087)
-----------------------------------------
最も小さい設定(1000 行・10 タスク)を除き、この条件下では concat が merge や map よりも高速でした。
20分前にすでに投稿されている回答とまったく同じ回答です。 – piRSquared
@piRSquared あなたの回答が投稿される前に、私はすでに書き始めていました。それに、このアプローチの限界も説明しているので、まったく同じ回答というわけではありません... – Borja
@piRSquared ところで、重複した値がある場合、concat を使うアプローチは失敗します。例えば: df (Task, Emp): (5, cc), (4, cc), (8, cc), (3, aa), (2, aa), (6, aa), (4, bb), (6, cc) / df1 (Task, Day): (1, 5), (7, 3), (0, 6), (6, 7), (8, 1), (9, 7), (5, 9), (9, 3), (3, 8) – Borja