source file: /home/buildslave/tahoe/edgy/build/src/allmydata/reliability.py
file stats: 92 lines, 92 executed: 100.0% covered
coverage versus previous test: 0 lines added, 0 lines removed
1. #! /usr/bin/python
2.
3. import math
4. from allmydata.util import statistics
5. from numpy import array, matrix, dot
6.
7. DAY=24*60*60
8. MONTH=31*DAY
9. YEAR=365*DAY
10.
11. class ReliabilityModel:
12. """Generate a model of system-wide reliability, given several input
13. parameters.
14.
15. This runs a simulation in which time is quantized down to 'delta' seconds
16. (default is one month): a smaller delta will result in a more accurate
17. simulation, but will take longer to run. 'report_span' simulated seconds
18. will be run.
19.
20. The encoding parameters are provided as 'k' (minimum number of shares
21. needed to recover the file) and 'N' (total number of shares generated).
22. The default parameters are 3-of-10.
23.
24. The first step is to build a probability of individual drive loss during
25. any given delta. This uses a simple exponential model, in which the
26. average drive lifetime is specified by the 'drive_lifetime' parameter
27. (default is 8 years).
28.
29. The second step is to calculate a 'transition matrix': a table of
30. probabilities that shows, given A shares at the start of the delta, what
31. the chances are of having B shares left at the end of the delta. The
32. current code optimistically assumes all drives are independent. A
33. subclass could override that assumption.
34.
35. An additional 'repair matrix' is created to show what happens when the
36. Checker/Repairer is run. In the simulation, the Checker will be run every
37. 'check_period' seconds (default is one month), and the Repairer will be
38. run if it sees fewer than 'R' shares (default 7).
39.
40. The third step is to finally run the simulation. An initial probability
41. vector is created (with a 100% chance of N shares and a 0% chance of
42. fewer than N shares), then it is multiplied by the transition matrix for
43. every delta of time. Each time the Checker is to be run, the repair
44. matrix is multiplied in, and some additional stats are accumulated
45. (average number of repairs that occur, average number of shares
46. regenerated per repair).
47.
48. The output is a ReliabilityReport instance, which contains a table that
49. samples the state of the simulation once each 'report_period' seconds
50. (defaults to 3 months). Each row of this table will contain the
51. probability vector for one sample period (chance of having X shares, from
52. 0 to N, at the end of the period). The report will also contain other
53. information.
54.
55. """
56.
57. @classmethod
58. def run(klass,
59. drive_lifetime=8*YEAR,
60. k=3, R=7, N=10,
61. delta=1*MONTH,
62. check_period=1*MONTH,
63. report_period=3*MONTH,
64. report_span=5*YEAR,
65. ):
66. self = klass()
67.
68. check_period = check_period-1
69. P = self.p_in_period(drive_lifetime, delta)
70.
71. decay = self.build_decay_matrix(N, P)
72.
73. repair = self.build_repair_matrix(k, N, R)
74.
75. #print "DECAY:", decay
76. #print "OLD-POST-REPAIR:", old_post_repair
77. #print "NEW-POST-REPAIR:", decay * repair
78. #print "REPAIR:", repair
79. #print "DIFF:", (old_post_repair - decay * repair)
80.
81. START = array([0]*N + [1])
82. ALIVE = array([0]*k + [1]*(1+N-k))
83. DEAD = array([1]*k + [0]*(1+N-k))
84. REPAIRp = array([0]*k + [1]*(R-k) + [0]*(1+N-R))
85. REPAIR_newshares = array([0]*k +
86. [N-i for i in range(k, R)] +
87. [0]*(1+N-R))
88. assert REPAIR_newshares.shape[0] == N+1
89. #print "START", START
90. #print "ALIVE", ALIVE
91. #print "REPAIRp", REPAIRp
92. #print "REPAIR_newshares", REPAIR_newshares
93.
94. unmaintained_state = START
95. maintained_state = START
96. last_check = 0
97. last_report = 0
98. P_repaired_last_check_period = 0.0
99. needed_repairs = []
100. needed_new_shares = []
101. report = ReliabilityReport()
102.
103. for t in range(0, report_span+delta, delta):
104. # the .A[0] turns the one-row matrix back into an array
105. unmaintained_state = (unmaintained_state * decay).A[0]
106. maintained_state = (maintained_state * decay).A[0]
107. if (t-last_check) > check_period:
108. last_check = t
109. # we do a check-and-repair this frequently
110. need_repair = dot(maintained_state, REPAIRp)
111.
112. P_repaired_last_check_period = need_repair
113. new_shares = dot(maintained_state, REPAIR_newshares)
114. needed_repairs.append(need_repair)
115. needed_new_shares.append(new_shares)
116.
117. maintained_state = (maintained_state * repair).A[0]
118.
119. if (t-last_report) > report_period:
120. last_report = t
121. P_dead_unmaintained = dot(unmaintained_state, DEAD)
122. P_dead_maintained = dot(maintained_state, DEAD)
123. cumulative_number_of_repairs = sum(needed_repairs)
124. cumulative_number_of_new_shares = sum(needed_new_shares)
125. report.add_sample(t, unmaintained_state, maintained_state,
126. P_repaired_last_check_period,
127. cumulative_number_of_repairs,
128. cumulative_number_of_new_shares,
129. P_dead_unmaintained, P_dead_maintained)
130.
131. # record one more sample at the end of the run
132. P_dead_unmaintained = dot(unmaintained_state, DEAD)
133. P_dead_maintained = dot(maintained_state, DEAD)
134. cumulative_number_of_repairs = sum(needed_repairs)
135. cumulative_number_of_new_shares = sum(needed_new_shares)
136. report.add_sample(t, unmaintained_state, maintained_state,
137. P_repaired_last_check_period,
138. cumulative_number_of_repairs,
139. cumulative_number_of_new_shares,
140. P_dead_unmaintained, P_dead_maintained)
141.
142. #def yandm(seconds):
143. # return "%dy.%dm" % (int(seconds/YEAR), int( (seconds%YEAR)/MONTH))
144. #needed_repairs_total = sum(needed_repairs)
145. #needed_new_shares_total = sum(needed_new_shares)
146. #print "at 2y:"
147. #print " unmaintained", unmaintained_state
148. #print " maintained", maintained_state
149. #print " number of repairs", needed_repairs_total
150. #print " new shares generated", needed_new_shares_total
151. #repair_rate_inv = report_span / needed_repairs_total
152. #print " avg repair rate: once every %s" % yandm(repair_rate_inv)
153. #print " avg repair download: one share every %s" % yandm(repair_rate_inv/k)
154. #print " avg repair upload: one share every %s" % yandm(report_span / needed_new_shares_total)
155.
156. return report
157.
158. def p_in_period(self, avg_lifetime, period):
159. """Given an average lifetime of a disk (using an exponential model),
160. what is the chance that a live disk will survive the next 'period'
161. seconds?"""
162.
163. # eg p_in_period(8*YEAR, MONTH) = 98.94%
164. return math.exp(-1.0*period/avg_lifetime)
165.
166. def build_decay_matrix(self, N, P):
167. """Return a decay matrix. decay[start_shares][end_shares] is the
168. conditional probability of finishing with end_shares, given that we
169. started with start_shares."""
170. decay_rows = []
171. decay_rows.append( [0.0]*(N+1) )
172. for start_shares in range(1, (N+1)):
173. end_shares = self.build_decay_row(start_shares, P)
174. decay_row = end_shares + [0.0] * (N-start_shares)
175. assert len(decay_row) == (N+1), len(decay_row)
176. decay_rows.append(decay_row)
177.
178. decay = matrix(decay_rows)
179. return decay
180.
181. def build_decay_row(self, start_shares, P):
182. """Return a decay row 'end_shares'. end_shares[i] is the chance that
183. we finish with i shares, given that we started with start_shares, for
184. all i between 0 and start_shares, inclusive. This implementation
185. assumes that all shares are independent (IID), but a more complex
186. model could incorporate inter-share failure correlations like having
187. two shares on the same server."""
188. end_shares = statistics.binomial_distribution_pmf(start_shares, P)
189. return end_shares
190.
191. def build_repair_matrix(self, k, N, R):
192. """Return a repair matrix. repair[start][end]: is the conditional
193. probability of the repairer finishing with 'end' shares, given that
194. it began with 'start' shares (repair if fewer than R shares). The
195. repairer's behavior is deterministic, so all values in this matrix
196. are either 0 or 1. This matrix should be applied *after* the decay
197. matrix."""
198. new_repair_rows = []
199. for start_shares in range(0, N+1):
200. new_repair_row = [0] * (N+1)
201. if start_shares < k:
202. new_repair_row[start_shares] = 1
203. elif start_shares < R:
204. new_repair_row[N] = 1
205. else:
206. new_repair_row[start_shares] = 1
207. new_repair_rows.append(new_repair_row)
208.
209. repair = matrix(new_repair_rows)
210. return repair
211.
212. class ReliabilityReport:
213. def __init__(self):
214. self.samples = []
215.
216. def add_sample(self, when, unmaintained_shareprobs, maintained_shareprobs,
217. P_repaired_last_check_period,
218. cumulative_number_of_repairs,
219. cumulative_number_of_new_shares,
220. P_dead_unmaintained, P_dead_maintained):
221. """
222. when: the timestamp at the end of the report period
223. unmaintained_shareprobs: a vector of probabilities, element[S]
224. is the chance that there are S shares
225. left at the end of the report period.
226. This tracks what happens if no repair
227. is ever done.
228. maintained_shareprobs: same, but for 'maintained' grids, where
229. check and repair is done at the end
230. of each check period
231. P_repaired_last_check_period: a float, with the probability
232. that a repair was performed
233. at the end of the most recent
234. check period.
235. cumulative_number_of_repairs: a float, with the average number
236. of repairs that will have been
237. performed by the end of the
238. report period
239. cumulative_number_of_new_shares: a float, with the average number
240. of new shares that repair proceses
241. generated by the end of the report
242. period
243. P_dead_unmaintained: a float, with the chance that the file will
244. be unrecoverable at the end of the period
245. P_dead_maintained: same, but for maintained grids
246.
247. """
248. row = (when, unmaintained_shareprobs, maintained_shareprobs,
249. P_repaired_last_check_period,
250. cumulative_number_of_repairs,
251. cumulative_number_of_new_shares,
252. P_dead_unmaintained, P_dead_maintained)
253. self.samples.append(row)