A python class that performs hierarchical clustering and displays a heatmap using scipy and matplotlib. Improved to be require only as input a pandas DataFrame.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | """
This code was adapted from the following recipe:
* http://altanalyze.blogspot.se/2012/06/hierarchical-clustering-heatmaps-in.html
* http://code.activestate.com/recipes/578175/
Which was in turn inspired by many other posts:
* http://stackoverflow.com/questions/7664826
* http://stackoverflow.com/questions/2982929
* http://stackoverflow.com/questions/2455761
Running this with cosine or other distance metrics can often produce negative Z scores during clustering, so adjustments to the clustering may be required. Information about distance measures can be found here:
* http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
* http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
The documentation about the custom color gradients can be found here:
* http://matplotlib.sourceforge.net/examples/pylab_examples/custom_cmap.html
"""
# Built-in modules #
import random
# Third party modules #
import numpy, scipy, matplotlib, pandas
from matplotlib import pyplot
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
import names
###############################################################################
# Create Custom Color Gradients #
red_black_sky = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.9), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
red_black_blue = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
red_black_green = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'blue': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
'green': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
yellow_black_blue = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.8), (0.5, 0.1, 0.0), (1.0, 1.0, 1.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
make_cmap = lambda x: matplotlib.colors.LinearSegmentedColormap('my_colormap', x, 256)
color_gradients = {'red_black_sky' : make_cmap(red_black_sky),
'red_black_blue' : make_cmap(red_black_blue),
'red_black_green' : make_cmap(red_black_green),
'yellow_black_blue' : make_cmap(yellow_black_blue),
'red_white_blue' : pyplot.cm.bwr,
'seismic' : pyplot.cm.seismic,
'green_white_purple' : pyplot.cm.PiYG_r,
'coolwarm' : pyplot.cm.coolwarm,}
###############################################################################
class HiearchicalHeatmap(object):
"""A common use case for biologists analyzing their gene expression data is to cluster and visualize patterns of expression in the form of a heatmap and associated dendrogram."""
row_method = 'single' # Can be: linkage, single, complete, average, weighted, centroid, median, ward
column_method = 'single' # Can be: linkage, single, complete, average, weighted, centroid, median, ward
row_metric = 'braycurtis' # Can be: see scipy documentation
column_metric = 'braycurtis' # Can be: see scipy documentation
gradient_span = 'only_max' # Can be: min_to_max, min_to_max_centered, only_max, only_min
color_gradient = 'yellow_black_blue' # Can be: see color_gradients dictionary
fig_weight = 12
fig_height = 8.5
def plot(self):
# Names #
row_header = self.frame.index
column_header = self.frame.columns
# What color to use #
cmap = color_gradients[self.color_gradient]
# Scale the max and min colors #
value_min = self.frame.min().min()
value_max = self.frame.max().max()
if self.gradient_span == 'min_to_max_centered':
value_max = max([value_max, abs(value_min)])
value_min = value_max * -1
if self.gradient_span == 'only_max': value_min = 0
if self.gradient_span == 'only_min': value_max = 0
norm = matplotlib.colors.Normalize(value_min, value_max)
# Scale the figure window size #
fig = pyplot.figure(figsize=(self.fig_weight, self.fig_height))
# Calculate positions for all elements #
# ax1, placement of dendrogram 1, on the left of the heatmap
### The second value controls the position of the matrix relative to the bottom of the view
[ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]
width_between_ax1_axr = 0.004
### distance between the top color bar axis and the matrix
height_between_ax1_axc = 0.004
### Sufficient size to show
color_bar_w = 0.015
# axr, placement of row side colorbar #
### second to last controls the width of the side color bar - 0.015 when showing
[axr_x, axr_y, axr_w, axr_h] = [0.31, 0.1, color_bar_w, 0.6]
axr_x = ax1_x + ax1_w + width_between_ax1_axr
axr_y = ax1_y; axr_h = ax1_h
width_between_axr_axm = 0.004
# axc, placement of column side colorbar #
### last one controls the hight of the top color bar - 0.015 when showing
[axc_x, axc_y, axc_w, axc_h] = [0.4, 0.63, 0.5, color_bar_w]
axc_x = axr_x + axr_w + width_between_axr_axm
axc_y = ax1_y + ax1_h + height_between_ax1_axc
height_between_axc_ax2 = 0.004
# axm, placement of heatmap for the data matrix #
[axm_x, axm_y, axm_w, axm_h] = [0.4, 0.9, 2.5, 0.5]
axm_x = axr_x + axr_w + width_between_axr_axm
axm_y = ax1_y; axm_h = ax1_h
axm_w = axc_w
# ax2, placement of dendrogram 2, on the top of the heatmap #
### last one controls hight of the dendrogram
[ax2_x, ax2_y, ax2_w, ax2_h] = [0.3, 0.72, 0.6, 0.15]
ax2_x = axr_x + axr_w + width_between_axr_axm
ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
ax2_w = axc_w
# axcb - placement of the color legend #
[axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.09]
# Compute and plot top dendrogram #
if self.column_method:
d2 = dist.pdist(self.frame.transpose())
D2 = dist.squareform(d2)
ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)
Y2 = sch.linkage(D2, method=self.column_method, metric=self.column_metric)
Z2 = sch.dendrogram(Y2)
ind2 = sch.fcluster(Y2, 0.7*max(Y2[:,2]), 'distance')
ax2.set_xticks([])
ax2.set_yticks([])
### apply the clustering for the array-dendrograms to the actual matrix data
idx2 = Z2['leaves']
self.frame = self.frame.iloc[:,idx2]
### reorder the flat cluster to match the order of the leaves the dendrogram
ind2 = ind2[idx2]
else: idx2 = range(self.frame.shape[1])
# Compute and plot left dendrogram #
if self.row_method:
d1 = dist.pdist(self.frame)
D1 = dist.squareform(d1)
ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
Y1 = sch.linkage(D1, method=self.row_method, metric=self.row_metric)
Z1 = sch.dendrogram(Y1, orientation='right')
ind1 = sch.fcluster(Y1, 0.7*max(Y1[:,2]), 'distance')
ax1.set_xticks([])
ax1.set_yticks([])
### apply the clustering for the array-dendrograms to the actual matrix data
idx1 = Z1['leaves']
self.frame = self.frame.iloc[idx1,:]
### reorder the flat cluster to match the order of the leaves the dendrogram
ind1 = ind1[idx1]
else: idx1 = range(self.frame.shape[0])
# Plot distance matrix #
axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])
axm.matshow(self.frame, aspect='auto', origin='lower', cmap=cmap, norm=norm)
axm.set_xticks([])
axm.set_yticks([])
# Add text #
new_row_header = []
new_column_header = []
for i in range(self.frame.shape[0]):
axm.text(self.frame.shape[1]-0.5, i, ' ' + row_header[idx1[i]], verticalalignment="center")
new_row_header.append(row_header[idx1[i]] if self.row_method else row_header[i])
for i in range(self.frame.shape[1]):
axm.text(i, -0.9, ' '+column_header[idx2[i]], rotation=90, verticalalignment="top", horizontalalignment="center")
new_column_header.append(column_header[idx2[i]] if self.column_method else column_header[i])
# Plot column side colorbar #
if self.column_method:
axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])
cmap_c = matplotlib.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
dc = numpy.array(ind2, dtype=int)
dc.shape = (1,len(ind2))
axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
axc.set_xticks([])
axc.set_yticks([])
# Plot column side colorbar #
if self.row_method:
axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
dr = numpy.array(ind1, dtype=int)
dr.shape = (len(ind1),1)
cmap_r = matplotlib.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
axr.set_xticks([])
axr.set_yticks([])
# Plot color legend #
### axes for colorbar
axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)
cb = matplotlib.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
axcb.set_title("colorkey")
max_cb_ticks = 5
axcb.xaxis.set_major_locator(pyplot.MaxNLocator(max_cb_ticks))
# Render the graphic #
if len(row_header)>50 or len(column_header)>50: pyplot.rcParams['font.size'] = 5
else: pyplot.rcParams['font.size'] = 8
# Return figure #
return fig, axm, axcb, cb
###############################################################################
class TestHeatmap(HiearchicalHeatmap):
short_name = 'test_heatmap'
def data(self, size=20):
"""Create some fake data in a dataframe"""
numpy.random.seed(0)
random.seed(0)
x = scipy.rand(size)
M = scipy.zeros([size,size])
for i in range(size):
for j in range(size): M[i,j] = abs(x[i] - x[j])
df = pandas.DataFrame(M, index=[names.get_last_name() for _ in range(size)],
columns=[names.get_first_name() for _ in range(size)])
df['Mary']['Day'] = 1.5
df['Issac']['Day'] = 1.0
return df
def plot(self):
self.frame = self.data()
self.path = '/tmp/' + self. short_name + '.pdf'
fig, axm, axcb, cb = HiearchicalHeatmap.plot(self)
cb.set_label("Random value")
pyplot.savefig(self.path)
###############################################################################
def test():
graph = TestHeatmap()
graph.plot()
return graph
|
A common use case for biologists analyzing their DNA sequencing data is to cluster and visualize patterns of expression in the form of a heatmap and associated dendrogram.