2017-04-20 20 views
1

pandasで、ある列でグループ化し、別の列の値の順列(すべての組み合わせ)を作成したいです。次のようなデータフレームがあります。

# Sample input: one base name together with three spelling variations of it.
df = pd.DataFrame(
    [
        ['jon snow', 'jon-snow'],
        ['jon snow', 'jon+snow'],
        ['jon snow', 'jonsnow'],
    ],
    columns=['name', 'name_variation'],
)

私が欲しい結果は次のとおりです。

# Desired output: for the single name, every ordered pair drawn from the three
# variations plus a missing value (4 x 4 = 16 distinct rows).
df_want = pd.DataFrame([['jon snow', 'jon-snow', 'jon-snow'],
       ['jon snow', 'jon-snow', 'jon+snow'],
       ['jon snow', 'jon-snow', 'jonsnow'],
       ['jon snow', 'jon-snow', np.nan],
       ['jon snow', 'jon+snow', 'jon-snow'],
       ['jon snow', 'jon+snow', 'jon+snow'],
       ['jon snow', 'jon+snow', 'jonsnow'],
       ['jon snow', 'jon+snow', np.nan],
       ['jon snow', 'jonsnow', 'jon-snow'],
       ['jon snow', 'jonsnow', 'jon+snow'],
       ['jon snow', 'jonsnow', 'jonsnow'],
       ['jon snow', 'jonsnow', np.nan],
       ['jon snow', np.nan, 'jon-snow'],
       ['jon snow', np.nan, 'jon+snow'],
       ['jon snow', np.nan, 'jonsnow'],
       ['jon snow', np.nan, np.nan]], columns=['name', 'name_variation', 'name_variation_2'])

次のコードを試してみました。動作はしますが、長ったらしく感じます。

def combinations(df):
    """Return every ordered pair of variations for each brand.

    Rows of ``df`` that are exact duplicates or that contain a missing value
    are discarded first. For each remaining brand, its distinct
    ``brand_variation`` values are paired with each other (including with
    themselves), so a brand with ``n`` variations contributes ``n * n`` rows.

    Args:
        df: DataFrame with at least the columns ``'brand'`` and
            ``'brand_variation'``. It is not modified.

    Returns:
        DataFrame with columns ``['brand', 'k', 'brand_variation']`` and a
        fresh 0..n-1 index. Brands appear in order of first occurrence;
        within a brand, ``k`` and ``brand_variation`` are in sorted order.
        An input with no usable rows yields an empty frame with those columns.
    """
    out_columns = ['brand', 'k', 'brand_variation']
    cleaned = df.drop_duplicates().dropna()

    frames = []
    # sort=False keeps brands in order of first appearance.
    for brand, group in cleaned.groupby('brand', sort=False):
        variations = sorted(group['brand_variation'].unique())
        # Cartesian product of the variations with themselves, tagged with
        # the brand they belong to.
        pairs = pd.MultiIndex.from_product(
            [[brand], variations, variations], names=out_columns
        )
        frames.append(pairs.to_frame(index=False))

    if not frames:
        return pd.DataFrame(columns=out_columns)
    # DataFrame.append no longer exists in pandas >= 2.0; concatenate once.
    return pd.concat(frames, ignore_index=True)

これを行うには良い方法はありますか?

答えて

2

numpy

# Distinct values found anywhere in the frame, in order of first appearance.
u = pd.unique(df.values.ravel())
r = np.arange(u.size)
# i repeats each position u.size times (0,0,..,1,1,..) while j cycles through
# all positions (0,1,..,0,1,..); together they enumerate every ordered pair.
i = np.repeat(r, u.size)
j = np.tile(r, u.size)

pd.DataFrame({
    'name': ['jon snow'] * i.size,
    'name_variation': u[i],
    'name_variation2': u[j],
})

     name name_variation name_variation2 
0 jon snow  jon snow  jon snow 
1 jon snow  jon snow  jon-snow 
2 jon snow  jon snow  jon+snow 
3 jon snow  jon snow   jonsnow 
4 jon snow  jon-snow  jon snow 
5 jon snow  jon-snow  jon-snow 
6 jon snow  jon-snow  jon+snow 
7 jon snow  jon-snow   jonsnow 
8 jon snow  jon+snow  jon snow 
9 jon snow  jon+snow  jon-snow 
10 jon snow  jon+snow  jon+snow 
11 jon snow  jon+snow   jonsnow 
12 jon snow  jonsnow  jon snow 
13 jon snow  jonsnow  jon-snow 
14 jon snow  jonsnow  jon+snow 
15 jon snow  jonsnow   jonsnow 

pandas

# Distinct values found anywhere in the frame, in order of first appearance.
u = pd.unique(df.values.ravel())
# Index holding every ordered pair (first, second) of those values.
pairs = pd.MultiIndex.from_product(
    [u, u], names=['name_variation', 'name_variation2']
)
# Broadcast the constant name over all pairs, then turn the index levels into
# ordinary columns.
pd.Series('jon snow', index=pairs, name='name').reset_index()

    name_variation name_variation2  name 
0  jon snow  jon snow jon snow 
1  jon snow  jon-snow jon snow 
2  jon snow  jon+snow jon snow 
3  jon snow   jonsnow jon snow 
4  jon-snow  jon snow jon snow 
5  jon-snow  jon-snow jon snow 
6  jon-snow  jon+snow jon snow 
7  jon-snow   jonsnow jon snow 
8  jon+snow  jon snow jon snow 
9  jon+snow  jon-snow jon snow 
10  jon+snow  jon+snow jon snow 
11  jon+snow   jonsnow jon snow 
12  jonsnow  jon snow jon snow 
13  jonsnow  jon-snow jon snow 
14  jonsnow  jon+snow jon snow 
15  jonsnow   jonsnow jon snow 
0

実は非常に簡単な解決策がありました。気づくのが遅すぎました:

# Joining the frame to itself on 'brand' pairs every row of a brand with every
# other row of the same brand; exact duplicate rows are then removed.
paired = df.merge(df, on='brand', how='inner')
df = paired.drop_duplicates()

arrrrgh!

関連する問題