|
终于遇到第一块硬骨头
Hadoop没有使用java.util.Properties管理配置文件,而是自己定义了一套配置文件管理系统和自己的API。
1 package org.apache.hadoop.conf;
2
3 import java.io.BufferedInputStream;
4 import java.io.DataInput;
5 import java.io.DataOutput;
6 import java.io.File;
7 import java.io.FileInputStream;
8 import java.io.IOException;
9 import java.io.InputStream;
10 import java.io.InputStreamReader;
11 import java.io.OutputStream;
12 import java.io.Reader;
13 import java.io.Writer;
14 import java.net.URL;
15 import java.util.ArrayList;
16 import java.util.Collection;
17 import java.util.Enumeration;
18 import java.util.HashMap;
19 import java.util.HashSet;
20 import java.util.Iterator;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.Map;
24 import java.util.Properties;
25 import java.util.Set;
26 import java.util.StringTokenizer;
27 import java.util.WeakHashMap;
28 import java.util.concurrent.CopyOnWriteArrayList;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31 //引入了IO的流类
32 //引入了网络编程类,用来封装或获取网络资源
33 //引入了工具包中的集合类。其中StringTokenizer在hadoop中用的非常多,特别是在mapreduce编程中,经常需要这个工具类来处理数据。
34 //它起着分词器的作用。
35 //关于CopyOnWriteArrayList,之前没接触过,具体推荐 http://blog.iyunv.com/imzoer/article/details/9751591,就是实现了线程安全
36 //关于regex.Matcher和regex.Pattern,看regex可知就是正则类,具体用法不太一样 http://ningtukun.blog.163.com/blog/static/186541445201292984311656/
37 import javax.xml.parsers.DocumentBuilder;
38 import javax.xml.parsers.DocumentBuilderFactory;
39 import javax.xml.parsers.ParserConfigurationException;
40 import javax.xml.transform.Transformer;
41 import javax.xml.transform.TransformerFactory;
42 import javax.xml.transform.dom.DOMSource;
43 import javax.xml.transform.stream.StreamResult;
44
45 import org.apache.commons.logging.Log;
46 import org.apache.commons.logging.LogFactory;
47 import org.apache.hadoop.fs.FileSystem;
48 import org.apache.hadoop.fs.Path;
49 import org.apache.hadoop.io.Writable;
50 import org.apache.hadoop.io.WritableUtils;
51 import org.apache.hadoop.util.StringUtils;
52 import org.codehaus.jackson.JsonFactory;
53 import org.codehaus.jackson.JsonGenerator;
54 import org.w3c.dom.DOMException;
55 import org.w3c.dom.Document;
56 import org.w3c.dom.Element;
57 import org.w3c.dom.Node;
58 import org.w3c.dom.NodeList;
59 import org.w3c.dom.Text;
60 import org.xml.sax.SAXException;
61
62 /**
63 * Provides access to configuration parameters.
64 *
65 * Resources
66 *
67 * Configurations are specified by resources. A resource contains a set of
68 * name/value pairs as XML data. Each resource is named by either a
69 * String or by a {@link Path}. If named by a String,
70 * then the classpath is examined for a file with that name. If named by a
71 * Path, then the local filesystem is examined directly, without
72 * referring to the classpath.
73 *
74 * Unless explicitly turned off, Hadoop by default specifies two
75 * resources, loaded in-order from the classpath:
76 * core-default.xml
77 * : Read-only defaults for hadoop.
78 * core-site.xml: Site-specific configuration for a given hadoop
79 * installation.
80 *
81 * Applications may add additional resources, which are loaded
82 * subsequent to these resources in the order they are added.
83 *
84 * Final Parameters
85 *
86 * Configuration parameters may be declared final.
87 * Once a resource declares a value final, no subsequently-loaded
88 * resource can alter that value.
89 * For example, one might define a final parameter with:
90 *
91 * <property>
92 * <name>dfs.client.buffer.dir</name>
93 * <value>/tmp/hadoop/dfs/client</value>
94 * <final>true</final>
95 * </property>
96 *
97 * Administrators typically define parameters as final in
98 * core-site.xml for values that user applications may not alter.
99 *
100 * Variable Expansion
101 *
102 * Value strings are first processed for variable expansion. The
103 * available properties are:
104 * Other properties defined in this Configuration; and, if a name is
105 * undefined here,
106 * Properties in {@link System#getProperties()}.
107 *
108 *
109 * For example, if a configuration resource contains the following property
110 * definitions:
111 *
112 * <property>
113 * <name>basedir</name>
114 * <value>/user/${user.name}</value>
115 * </property>
116 *
117 * <property>
118 * <name>tempdir</name>
119 * <value>${basedir}/tmp</value>
120 * </property>
121 *
122 * When conf.get("tempdir") is called, then ${basedir}
123 * will be resolved to another property in this Configuration, while
124 * ${user.name} would then ordinarily be resolved to the value
125 * of the System property with that name.
126 */
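127 //一大段注释。第一句说这个类是用来提供访问配置中属性的渠道
128 //第二段话说配置资源的构造:每个资源是一个XML文件,里面是一组键值对;资源用String命名时从classpath中查找,用Path命名时直接从本地文件系统读取。
129 //第三段话说除非主动关闭,否则hadoop会默认从classpath中按顺序加载core-default.xml和core-site.xml两个配置文件。应用还可以追加别的资源,它们按添加的顺序排在默认资源之后加载
130 //第四段说了配置文件中属性标记为final会怎么样:一旦某个资源把属性声明为final,之后加载的资源就不能再修改它的值
131 //第五段说配置文件中的属性值可以用${变量名}的形式引用别的属性,可以不是具体的实值;变量先在本Configuration的属性中查找,找不到再去Java的系统属性中查找。还举了个例子。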
132 public class Configuration implements Iterable,
133 Writable {
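134 //实现了Iterable和Writable接口。
135 //实现Iterable接口后,可以调用iterator()方法进行迭代,也可以直接把配置对象用在for-each循环里
136 //关于Map.Entry类,之前不了解。它表示Map中的一个键值对条目,遍历Map时可以用getKey()/getValue()方便地取出键和值(后面的write()和getValByRegex()就是这样用的)
137 //实现了Writable接口。hadoop没有采用Java自带的序列化(具体原因不解释),而是引入了自己的序列化系统,所有的
138 //需要序列化的对象都要实现Writable接口(对应后面的readFields()和write()两个方法)。以后会遇到。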
139 private static final Log LOG =
140 LogFactory.getLog(Configuration.class);
141 //创建了一个日志类。并做了初始化
142 private boolean quietmode = true;
143 //布尔变量quietmode,“安静模式”,用来确定加载配置的时候日志的某些动作,
144 //当为true的时候则在加载解析配置文件的过程中不输出日志信息,反之......
145 /**
146 * List of configuration resources.
147 */
148 private ArrayList resources = new ArrayList();
149 //保存了所有通过addResource()方法添加的Configuration对象的资源
150 /**
151 * List of configuration parameters marked final.
152 */
153 private Set finalParameters = new HashSet();
154 //用来保存所有在配置文件中已经被声明为final的键–值对的键
155 private boolean loadDefaults = true;
156 //是否加载默认的配置资源
157 /**
158 * Configuration objects
159 */
160 private static final WeakHashMap REGISTRY =
161 new WeakHashMap();
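162 //REGISTRY是一个WeakHashMap的变量,key为Configuration,value为Object,
163 //构造方法里会通过REGISTRY.put(this, null)把每个新建的Configuration对象登记进来(value始终是null,只用到了key),弱HashMap可以自动清除不再被正常使用的键对应的条目,
164 //addDefaultResource()添加新的默认资源时会遍历REGISTRY中的所有key,让其中loadDefaults为true的对象重新加载配置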
165 /**
166 * List of default Resources. Resources are loaded in the order of the list
167 * entries
168 */
169 private static final CopyOnWriteArrayList defaultResources =
170 new CopyOnWriteArrayList();
171 //存放的是默认配置资源的名字列表,通过方法addDefaultResource()可以添加新的默认资源,加载时按列表中的顺序进行
172 //存储的是配置文件的名字(加载时到classpath中去找)而不是配置文件的全路径
173 /**
174 * Flag to indicate if the storage of resource which updates a key needs
175 * to be stored for each key
176 */
177 private boolean storeResource;
178 //标识是否需要为每个键记录“最近一次加载或修改它的是哪个资源”
179 /**
180 * Stores the mapping of key to the resource which modifies or loads
181 * the key most recently
182 */
183 private HashMap updatingResource;
184 //保存键到资源的映射:记录每个键最近一次是被哪个资源加载或修改的(只有storeResource为true时才会在私有构造方法里创建)
185 static{ // runs once, when the Configuration class is initialized
186 // Print a deprecation warning if hadoop-site.xml is found in the classpath.
187 ClassLoader cL = Thread.currentThread().getContextClassLoader();
188 if (cL == null) { // this thread has no context class loader: fall back to the loader of this class
189 cL = Configuration.class.getClassLoader();
190 }
191 if(cL.getResource("hadoop-site.xml")!=null) {
192 LOG.warn("DEPRECATED: hadoop-site.xml found in the classpath. " +
193 "Usage of hadoop-site.xml is deprecated. Instead use core-site.xml, "
194 + "mapred-site.xml and hdfs-site.xml to override properties of " +
195 "core-default.xml, mapred-default.xml and hdfs-default.xml " +
196 "respectively");
197 }
198 addDefaultResource("core-default.xml"); // added first, so it comes first in defaultResources
199 addDefaultResource("core-site.xml"); // added second, after core-default.xml
200 }
201 //静态代码块。先检查classpath中是否存在hadoop-site.xml,如果存在则发个警告信息,说它不推荐用,以及推荐改用哪些配置文件;
202 //然后添加两个默认的核心配置资源,一个是默认的(core-default.xml),一个是用户自己配置的(core-site.xml)。
203 private Properties properties;
204 //Hadoop配置文件解析后的键–值对,都存放在properties中
205 private Properties overlay;
206 //变量overlay用于记录通过set()方式改变的配置项。也就是说,出现在overlay中的键–值对是应用设置的,
207 //而不是通过对配置资源解析得到的
208 private ClassLoader classLoader; // class loader kept per instance; the code that uses it is outside this excerpt
209 { // instance initializer: runs for every constructor
210 classLoader = Thread.currentThread().getContextClassLoader(); // prefer the creating thread's context class loader
211 if (classLoader == null) { // none set on this thread: use the loader that loaded Configuration itself
212 classLoader = Configuration.class.getClassLoader();
213 }
214 }
215 //定义了一个类加载器,并做了初始化。非静态代码块
216 /** A new configuration that reads the default resources; same as {@code Configuration(true)}. */
217 public Configuration() {
218 this(true); // loadDefaults = true
219 }
220 //无参构造方法,相当于调用Configuration(true),也就是说用无参构造方法创建的对象默认会加载默认配置资源
221 /** A new configuration where the behavior of reading from the default
222 * resources can be turned off.
223 *
224 * If the parameter {@code loadDefaults} is false, the new instance
225 * will not load resources from the default files. The new instance is also
226 * recorded in the static registry. @param loadDefaults specifies whether to load from the default files
227 */
228 public Configuration(boolean loadDefaults) {
229 this.loadDefaults = loadDefaults;
230 if (LOG.isDebugEnabled()) {
231 LOG.debug(StringUtils.stringifyException(new IOException("config()"))); // the exception is never thrown, only turned into debug text; presumably to record where the instance was created — confirm against StringUtils
232 }
233 synchronized(Configuration.class) { // REGISTRY is static and shared by all instances, hence the class-level lock
234 REGISTRY.put(this, null); // only the key matters; addDefaultResource() walks the registry keys to trigger reloads
235 }
236 this.storeResource = false; // per-key resource tracking is only switched on by the private two-argument constructor
237 }
238 //构造方法。参数是是否加载默认配置文件
239 /**
240 * A new configuration with the same settings as another one, optionally
241 * able to remember, for each key, the resource that loaded or updated
242 * that key most recently.
243 * @param other the configuration from which to clone settings
244 * @param storeResource true to allocate the key-to-resource map
245 * ({@code updatingResource}); false leaves it unset
246 */
247 private Configuration(Configuration other, boolean storeResource) {
248 this(other); // clone resources, properties, overlay and final parameters, and register this instance
249 this.loadDefaults = other.loadDefaults; // the copy constructor does not copy this flag, so it is carried over here
250 this.storeResource = storeResource;
251 if (storeResource) {
252 updatingResource = new HashMap(); // starts empty; nothing is copied from other
253 }
254 }
255 //私有构造方法。先克隆other的全部设置,再根据storeResource决定是否额外记录每个键最近一次是被哪个资源加载或修改的
256 /**
257 * A new configuration with the same settings cloned from another: the resource list, the loaded properties (if any), the overlay (if any) and the set of final parameters are copied, and the new instance is added to the registry.
258 *
259 * @param other the configuration from which to clone settings.
260 */
261 @SuppressWarnings("unchecked")
262 public Configuration(Configuration other) {
263 if (LOG.isDebugEnabled()) {
264 LOG.debug(StringUtils.stringifyException
265 (new IOException("config(config)"))); // same debug trick as Configuration(boolean): the exception is only stringified, never thrown
266 }
267
268 this.resources = (ArrayList)other.resources.clone(); // copy of the list itself; note this read happens before the lock on other is taken
269 synchronized(other) { // same monitor as other's synchronized instance methods (e.g. reloadConfiguration)
270 if (other.properties != null) { // null means other has not loaded (or has been reset); then this instance starts unloaded too
271 this.properties = (Properties)other.properties.clone();
272 }
273
274 if (other.overlay!=null) {
275 this.overlay = (Properties)other.overlay.clone();
276 }
277 }
278
279 this.finalParameters = new HashSet(other.finalParameters); // independent copy; loadDefaults is not copied here and keeps its field default
280 synchronized(Configuration.class) { // guards the shared static REGISTRY
281 REGISTRY.put(this, null);
282 }
283 }
284 //拷贝构造方法。从另一个Configuration对象克隆资源列表、properties、overlay和final参数集合,并把新对象登记到REGISTRY中
285 /**
286 * Add a default resource. Resources are loaded in the order of the resources
287 * added. A name that is already in the list is ignored; a new name makes every registered configuration that loads defaults reload.
288 * @param name file name. File should be present in the classpath.
289 */
290 public static synchronized void addDefaultResource(String name) { // static synchronized: locks Configuration.class, the same monitor the constructors use for REGISTRY
291 if(!defaultResources.contains(name)) { // each default resource is listed at most once
292 defaultResources.add(name);
293 for(Configuration conf : REGISTRY.keySet()) {
294 if(conf.loadDefaults) { // instances created with loadDefaults == false are left alone
295 conf.reloadConfiguration(); // only resets the loaded state; see reloadConfiguration()
296 }
297 }
298 }
299 }
300 //添加默认配置资源的方法。如果这个名字还没有添加过,就把它加入defaultResources,并让所有已登记且loadDefaults为true的Configuration对象重新加载配置
301 /**
302 * Add a configuration resource named by a String.
303 *
304 * The properties of this resource will override properties of previously
305 * added resources, unless they were marked final.
306 *
307 * @param name resource to be added, the classpath is examined for a file
308 * with that name.
309 */
310 public void addResource(String name) {
311 addResourceObject(name); // appends the name to the resource list and resets the loaded state
312 }
313
314 /**
315 * Add a configuration resource named by a URL.
316 *
317 * The properties of this resource will override properties of previously
318 * added resources, unless they were marked final.
319 *
320 * @param url url of the resource to be added, the local filesystem is
321 * examined directly to find the resource, without referring to
322 * the classpath. NOTE(review): this wording is identical to the Path overload's; how a URL is actually opened is not visible in this excerpt — confirm.
323 */
324 public void addResource(URL url) {
325 addResourceObject(url); // appends the URL to the resource list and resets the loaded state
326 }
327
328 /**
329 * Add a configuration resource named by a Path.
330 *
331 * The properties of this resource will override properties of previously
332 * added resources, unless they were marked final.
333 *
334 * @param file file-path of resource to be added, the local filesystem is
335 * examined directly to find the resource, without referring to
336 * the classpath.
337 */
338 public void addResource(Path file) {
339 addResourceObject(file); // appends the Path to the resource list and resets the loaded state
340 }
341
342 /**
343 * Add a configuration resource supplied as an open stream.
344 *
345 * The properties of this resource will override properties of previously
346 * added resources, unless they were marked final.
347 *
348 * @param in InputStream to read the configuration resource from.
349 */
350 public void addResource(InputStream in) {
351 addResourceObject(in); // the stream object itself is stored in the resource list; it is not read here
352 }
353 //Hadoop的addResource()一共有四个重载方法,对应四种资源:
354 //
355 //URL资源(网络资源,指的是一个链接);
356 //
357 //CLASSPATH资源(String形式);
358 //
359 //Hadoop文件系统中的Path资源(该资源是基于Hadoop的FileSystem的,使用斜线“/”作为分隔符,如果是绝对路径,应该以“/”开始);另外还有InputStream资源(直接传入一个已经打开的输入流)。
360 /**
361 * Reload configuration from previously added resources.
362 *
363 * This method will clear all the configuration read from the added
364 * resources, and final parameters. This will make the resources to
365 * be read again before accessing the values. Values that are added
366 * via set methods will overlay values read from the resources. Nothing is read by this method itself, and the overlay field is not touched.
367 */
368 public synchronized void reloadConfiguration() {
369 properties = null; // trigger reload: drop everything read so far
370 finalParameters.clear(); // clear site-limits, i.e. forget which keys were marked final
371 }
372 //重新加载先前添加过的配置资源。注意这个方法本身并不读取配置文件,只是把properties置为null并清空finalParameters,等下次访问配置值时才会真正重新加载。加了线程并发关键字synchronized哦
373 private synchronized void addResourceObject(Object resource) { // shared back end of the four addResource overloads (String, URL, Path, InputStream)
374 resources.add(resource); // add to resources, after everything added earlier
375 reloadConfiguration(); // reset the loaded state so the new resource is taken into account
376 }
377 //注意resource是Object类型。会触发配置的重新加载
378 private static Pattern varPat = Pattern.compile("\\$\\{[^\\}\\$\u0020]+\\}"); // matches a "${name}" reference: "${", then one or more characters that are not '}', '$' or a space, then "}"
379 private static int MAX_SUBST = 20; // presumably the cap on substitution rounds in substituteVars; that loop is cut off in this listing — confirm
380
381 private String substituteVars(String expr) {
382 if (expr == null) {
383 return null;
384 }
385 Matcher match = varPat.matcher("");
386 String eval = expr;
387 for(int s=0; s0) {
1374 sb.append(", ");
1375 }
1376 }
1377 toString(resources, sb);
1378 return sb.toString();
1379 }
1380
1381 private void toString(List resources, StringBuffer sb) { // appends the elements of resources to sb, separated by ", "
1382 ListIterator i = resources.listIterator();
1383 while (i.hasNext()) {
1384 if (i.nextIndex() != 0) { // every element except the first is preceded by a separator
1385 sb.append(", ");
1386 }
1387 sb.append(i.next()); // StringBuffer.append(Object) uses the element's string form
1388 }
1389 }
1390
1391 /**
1392 * Set the quietness-mode.
1393 *
1394 * In the quiet-mode, error and informational messages might not be logged. The quietmode field is initialized to true, so quiet-mode is on until this is called with false.
1395 *
1396 * @param quietmode true to set quiet-mode on, false
1397 * to turn it off.
1398 */
1399 public synchronized void setQuietMode(boolean quietmode) {
1400 this.quietmode = quietmode;
1401 }
1402
1403 /** For debugging. Creates a Configuration with the default resources and writes it to standard output via writeXml; the arguments are not used. */
1404 public static void main(String[] args) throws Exception {
1405 new Configuration().writeXml(System.out); // writeXml is defined outside this excerpt
1406 }
1407
1408 @Override
1409 public void readFields(DataInput in) throws IOException { // deserialization: replaces this object's settings with the key/value pairs read from in
1410 clear(); // defined outside this excerpt; presumably drops the current settings — confirm
1411 int size = WritableUtils.readVInt(in); // number of key/value pairs that follow, as written by write()
1412 for(int i=0; i < size; ++i) {
1413 set(org.apache.hadoop.io.Text.readString(in), // arguments are evaluated left to right: this first read is the key
1414 org.apache.hadoop.io.Text.readString(in)); // and this second read is the value
1415 }
1416 }
1417 //反序列化方法。从输入流(DataInput,比如RPC传过来的数据)中读出键值对的个数和每一个键值对,还原到当前的配置对象中
1418 //set前调用了clear(),这个方法是本类的方法,然后clear()方法调用了Hashtable的clear()方法(Properties继承自Hashtable),把所有配置全清了
1419 //@Override -- left commented out in this listing
1420 public void write(DataOutput out) throws IOException { // serialization: writes every property to out in the layout readFields() reads back
1421 Properties props = getProps(); // getProps() is defined outside this excerpt; presumably the loaded properties — confirm
1422 WritableUtils.writeVInt(out, props.size()); // entry count first
1423 for(Map.Entry item: props.entrySet()) {
1424 org.apache.hadoop.io.Text.writeString(out, (String) item.getKey()); // key first
1425 org.apache.hadoop.io.Text.writeString(out, (String) item.getValue()); // then its value; a non-String key or value would fail these casts
1426 }
1427 }
1428 //序列化方法。把配置对象中的所有键值对写到输出流(DataOutput,比如RPC的流)中:先写键值对的个数,再依次写每个键和值
1429 /**
1430 * Get the properties whose key matches the given regex.
1431 * @param regex regular expression tested against each property key with {@link Matcher#find()}, so a match anywhere inside the key is enough
1432 * @return a new Map from each matching key to its value; entries whose key or value is not a String are skipped
1433 */
1434 public Map getValByRegex(String regex) {
1435 Pattern p = Pattern.compile(regex); // compiled once, reused for every key
1436
1437 Map result = new HashMap();
1438 Matcher m;
1439
1440 for(Map.Entry item: getProps().entrySet()) {
1441 if (item.getKey() instanceof String &&
1442 item.getValue() instanceof String) {
1443 m = p.matcher((String)item.getKey());
1444 if(m.find()) { // match
1445 result.put((String) item.getKey(), (String) item.getValue());
1446 }
1447 }
1448 }
1449 return result;
1450 }
1451 }
1452 //通过正则获取键值对集合:返回键能与正则匹配的所有配置项(用的是find(),键中只要有一部分匹配就算)
MapReduce的简单执行流程:用户作业执行JobClient.runJob(conf)代码会在Hadoop集群上将其启动。启动之后JobClient实例会向JobTracker获取JobId,而且客户端会将作业执行需要的作业资源复制到HDFS上,然后将作业提交给JobTracker。JobTracker在本地初始化作业,再从HDFS作业资源中获取作业输入的分割信息,根据这些信息JobTracker将作业分割成多个任务,然后分配给在与JobTracker心跳通信中请求任务的TaskTracker。TaskTracker接收到新的任务之后会先从HDFS上获取作业资源,包括作业配置信息和本作业分片输入,然后在本地启动一个JVM并执行任务。任务结束后将结果写回HDFS,并向JobTracker报告。
有些东西我说的不太明白,这里借鉴别人的
http://anyoneking.com/archives/212
http://f.dataguru.cn/thread-258563-1-1.html
原来JDK的源码包在JDK安装文件夹中,只要把src.zip解压就能用eclipse去attach source了。我这个井底之蛙。
即使这样逐个看它的变量和方法,也给人一种雾里看花,不识庐山真面目的感觉。
要真正熟悉了hadoop的启动过程,以及过程中对配置的API调用和相关处理过程,才能彻底理解这个类。
我还不够格,在网上查了也只是跟着别人的思路走,期待后面自己体验和调试来融会贯通。
|
|