yunvn1 发表于 2015-8-1 13:36:13

org.apache.hadoop.conf-Configuration

  终于遇到第一块硬骨头

  Hadoop没有使用java.util.Properties管理配置文件,而是自己定义了一套配置文件管理系统和自己的API。


  



   1 package org.apache.hadoop.conf;
   2
   3 import java.io.BufferedInputStream;
   4 import java.io.DataInput;
   5 import java.io.DataOutput;
   6 import java.io.File;
   7 import java.io.FileInputStream;
   8 import java.io.IOException;
   9 import java.io.InputStream;
10 import java.io.InputStreamReader;
11 import java.io.OutputStream;
12 import java.io.Reader;
13 import java.io.Writer;
14 import java.net.URL;
15 import java.util.ArrayList;
16 import java.util.Collection;
17 import java.util.Enumeration;
18 import java.util.HashMap;
19 import java.util.HashSet;
20 import java.util.Iterator;
21 import java.util.List;
22 import java.util.ListIterator;
23 import java.util.Map;
24 import java.util.Properties;
25 import java.util.Set;
26 import java.util.StringTokenizer;
27 import java.util.WeakHashMap;
28 import java.util.concurrent.CopyOnWriteArrayList;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31 //引入了IO的流类
32 //引入了网络编程类,用来封装或获取网络资源
33 //引入了工具包中的集合类。其中StringTokenizer在hadoop中用得非常多,特别是在mapreduce编程中,经常需要这个工具类来处理数据。
34 //它起着分词器的作用。
35 //关于CopyOnWriteArrayList,之前没接触过,具体推荐 http://blog.iyunv.com/imzoer/article/details/9751591,就是实现了线程安全
36 //关于regex.Matcher和regex.Pattern,看regex可知就是正则类,具体用法不太一样 http://ningtukun.blog.163.com/blog/static/186541445201292984311656/
37 import javax.xml.parsers.DocumentBuilder;
38 import javax.xml.parsers.DocumentBuilderFactory;
39 import javax.xml.parsers.ParserConfigurationException;
40 import javax.xml.transform.Transformer;
41 import javax.xml.transform.TransformerFactory;
42 import javax.xml.transform.dom.DOMSource;
43 import javax.xml.transform.stream.StreamResult;
44
45 import org.apache.commons.logging.Log;
46 import org.apache.commons.logging.LogFactory;
47 import org.apache.hadoop.fs.FileSystem;
48 import org.apache.hadoop.fs.Path;
49 import org.apache.hadoop.io.Writable;
50 import org.apache.hadoop.io.WritableUtils;
51 import org.apache.hadoop.util.StringUtils;
52 import org.codehaus.jackson.JsonFactory;
53 import org.codehaus.jackson.JsonGenerator;
54 import org.w3c.dom.DOMException;
55 import org.w3c.dom.Document;
56 import org.w3c.dom.Element;
57 import org.w3c.dom.Node;
58 import org.w3c.dom.NodeList;
59 import org.w3c.dom.Text;
60 import org.xml.sax.SAXException;
61
62 /**
63* Provides access to configuration parameters.
64*
65* Resources
66*
67* Configurations are specified by resources. A resource contains a set of
68* name/value pairs as XML data. Each resource is named by either a
69* String or by a {@link Path}. If named by a String,
70* then the classpath is examined for a file with that name.If named by a
71* Path, then the local filesystem is examined directly, without
72* referring to the classpath.
73*
74* Unless explicitly turned off, Hadoop by default specifies two
75* resources, loaded in-order from the classpath:
76* core-default.xml
77* : Read-only defaults for hadoop.
78* core-site.xml: Site-specific configuration for a given hadoop
79* installation.
80*
81* Applications may add additional resources, which are loaded
82* subsequent to these resources in the order they are added.
83*
84* Final Parameters
85*
86* Configuration parameters may be declared final.
87* Once a resource declares a value final, no subsequently-loaded
88* resource can alter that value.
89* For example, one might define a final parameter with:
90*
91*<property>
92*    <name>dfs.client.buffer.dir</name>
93*    <value>/tmp/hadoop/dfs/client</value>
94*    <final>true</final>
95*</property>
96*
97* Administrators typically define parameters as final in
98* core-site.xml for values that user applications may not alter.
99*
100* Variable Expansion
101*
102* Value strings are first processed for variable expansion. The
103* available properties are:
104* Other properties defined in this Configuration; and, if a name is
105* undefined here,
106* Properties in {@link System#getProperties()}.
107*
108*
109* For example, if a configuration resource contains the following property
110* definitions:
111*
112*<property>
113*    <name>basedir</name>
114*    <value>/user/${user.name}</value>
115*</property>
116*
117*<property>
118*    <name>tempdir</name>
119*    <value>${basedir}/tmp</value>
120*</property>
121*
122* When conf.get("tempdir") is called, then ${basedir}
123* will be resolved to another property in this Configuration, while
124* ${user.name} would then ordinarily be resolved to the value
125* of the System property with that name.
126*/
127//一大段注释。第一句说这个类是用来提供访问配置中属性的渠道
128//第二段话说配置文件的构造,XML文件中键值对。
129//第三段话说除非主动关闭,否则hadoop会默认加载两个配置文件(core-default.xml和core-site.xml)。应用还可以追加其他资源,它们按添加顺序在默认资源之后加载
130//第四段说了配置文件中属性标记为final会怎么样
131//第五段说配置文件中的属性可以用变量来表示,可以不是具体的实值。还举了个例子。
132 public class Configuration implements Iterable,
133                                       Writable {
134 //实现了Iterable和Writable接口。
135 //实现Iterable接口,可以调用iterator()方法进行迭代
136 //关于Map.Entry,之前不了解。它是Map内部的接口,表示一个键值对,遍历entrySet()时可以同时取到键和值
137 //实现了Writable接口。hadoop没有采用Java的序列化(具体原因不解释),而是引入了自己的序列化系统,所有的
138 //序列化对象都要实现writable接口。以后会遇到。                                 
139   private static final Log LOG =
140   LogFactory.getLog(Configuration.class);
141   //创建了一个日志类。并做了初始化
142   private boolean quietmode = true;
143   //布尔变量quietmode,“安静模式”,用来确定加载配置的时候日志的某些动作,
144   //当为true的时候则在加载解析配置文件的过程中不输出日志信息,反之......
145   /**
146    * List of configuration resources.
147    */
148   private ArrayList resources = new ArrayList();
149   //保存了所有通过addResource()方法添加的Configuration对象的资源
150   /**
151    * List of configuration parameters marked final.
152    */
153   private Set finalParameters = new HashSet();
154   //用来保存所有在配置文件中已经被声明为final的键–值对的键
155   private boolean loadDefaults = true;
156   //是否加载默认的配置资源
157   /**
158    * Configuration objects
159    */
160   private static final WeakHashMap REGISTRY =
161   new WeakHashMap();
162   //REGISTRY是一个WeakHashMap的变量,key为Configuration,value为Object(构造方法里put进去的value都是null),
163   //可以看出它登记了所有已创建的Configuration对象,弱HashMap可以自动清除不再正常使用的键对应的条目,
164   //addDefaultResource()添加新的默认资源时,会遍历其中的Configuration,让loadDefaults为true的对象重新加载配置
165   /**
166    * List of default Resources. Resources are loaded in the order of the list
167    * entries
168    */
169   private static final CopyOnWriteArrayList defaultResources =
170   new CopyOnWriteArrayList();
171   //存放的是默认的配置信息,通过方法addDefaultResource()可以添加系统的默认资源
172   //存储配置文件的名字而不是配置文件的全路径
173   /**
174    * Flag to indicate if the storage of resource which updates a key needs
175    * to be stored for each key
176    */
177   private boolean storeResource;
178   //标识是否需要为每个键记录最近一次加载或修改它的资源
179   /**
180    * Stores the mapping of key to the resource which modifies or loads
181    * the key most recently
182    */
183   private HashMap updatingResource;
184   //保存键到资源的映射,即每个键最近一次是被哪个资源加载或修改的
185   static{
186   //print deprecation warning if hadoop-site.xml is found in classpath
187   ClassLoader cL = Thread.currentThread().getContextClassLoader();
188   if (cL == null) {
189       cL = Configuration.class.getClassLoader();
190   }
191   if(cL.getResource("hadoop-site.xml")!=null) {
192       LOG.warn("DEPRECATED: hadoop-site.xml found in the classpath. " +
193         "Usage of hadoop-site.xml is deprecated. Instead use core-site.xml, "
194         + "mapred-site.xml and hdfs-site.xml to override properties of " +
195         "core-default.xml, mapred-default.xml and hdfs-default.xml " +
196         "respectively");
197   }
198   addDefaultResource("core-default.xml");
199   addDefaultResource("core-site.xml");
200   }
201   //静态代码块。加载了两个核心配置文件,一个是默认的,一个是用户自己配置的。
202   //如果在classpath中发现了hadoop-site.xml,则发个警告信息,说它已不推荐使用,并说明应该改用哪些配置文件
203   private Properties properties;
204   //Hadoop配置文件解析后的键–值对,都存放在properties中
205   private Properties overlay;
206   //变量overlay用于记录通过set()方式改变的配置项。也就是说,出现在overlay中的键–值对是应用设置的,
207   //而不是通过对配置资源解析得到的
208   private ClassLoader classLoader;
209   {
210   classLoader = Thread.currentThread().getContextClassLoader();
211   if (classLoader == null) {
212       classLoader = Configuration.class.getClassLoader();
213   }
214   }
215   //定义了一个类加载器,并做了初始化。非静态代码块
216   /** A new configuration. */
217   public Configuration() {
218   this(true);
219   }
220   //相当于空构造方法,或者说调用空构造方法的时候默认调用默认配置文件
221   /** A new configuration where the behavior of reading from the default
222    * resources can be turned off.
223    *
224    * If the parameter {@code loadDefaults} is false, the new instance
225    * will not load resources from the default files.
226    * @param loadDefaults specifies whether to load from the default files
227    */
228   public Configuration(boolean loadDefaults) {
229   this.loadDefaults = loadDefaults;
230   if (LOG.isDebugEnabled()) {
231       LOG.debug(StringUtils.stringifyException(new IOException("config()")));
232   }
233   synchronized(Configuration.class) {
234       REGISTRY.put(this, null);
235   }
236   this.storeResource = false;
237   }
238   //构造方法。参数是是否加载默认配置文件
239   /**
240    * A new configuration with the same settings and additional facility for
241    * storage of resource to each key which loads or updates
242    * the key most recently
243    * @param other the configuration from which to clone settings
244    * @param storeResource flag to indicate if the storage of resource to
245    * each key is to be stored
246    */
247   private Configuration(Configuration other, boolean storeResource) {
248   this(other);
249   this.loadDefaults = other.loadDefaults;
250   this.storeResource = storeResource;
251   if (storeResource) {
252       updatingResource = new HashMap();
253   }
254   }
255   //私有构造方法。先克隆另一个Configuration的设置,再根据storeResource决定是否记录每个键最近由哪个资源加载或更新(为true时才创建updatingResource)
256   /**
257    * A new configuration with the same settings cloned from another.
258    *
259    * @param other the configuration from which to clone settings.
260    */
261   @SuppressWarnings("unchecked")
262   public Configuration(Configuration other) {
263   if (LOG.isDebugEnabled()) {
264       LOG.debug(StringUtils.stringifyException
265               (new IOException("config(config)")));
266   }
267   
268    this.resources = (ArrayList)other.resources.clone();
269    synchronized(other) {
270      if (other.properties != null) {
271      this.properties = (Properties)other.properties.clone();
272      }
273
274      if (other.overlay!=null) {
275      this.overlay = (Properties)other.overlay.clone();
276      }
277    }
278   
279   this.finalParameters = new HashSet(other.finalParameters);
280   synchronized(Configuration.class) {
281       REGISTRY.put(this, null);
282   }
283   }
284   //拷贝构造方法。克隆另一个Configuration的resources、properties、overlay和finalParameters,并把新对象登记到REGISTRY中
285   /**
286    * Add a default resource. Resources are loaded in the order of the resources
287    * added.
288    * @param name file name. File should be present in the classpath.
289    */
290   public static synchronized void addDefaultResource(String name) {
291   if(!defaultResources.contains(name)) {
292       defaultResources.add(name);
293       for(Configuration conf : REGISTRY.keySet()) {
294         if(conf.loadDefaults) {
295         conf.reloadConfiguration();
296         }
297       }
298   }
299   }
300   //加载默认配置文件方法
301   /**
302    * Add a configuration resource.
303    *
304    * The properties of this resource will override properties of previously
305    * added resources, unless they were marked final.
306    *
307    * @param name resource to be added, the classpath is examined for a file
308    *             with that name.
309    */
310   public void addResource(String name) {
311   addResourceObject(name);
312   }
313
314   /**
315    * Add a configuration resource.
316    *
317    * The properties of this resource will override properties of previously
318    * added resources, unless they were marked final.
319    *
320    * @param url url of the resource to be added, the local filesystem is
321    *            examined directly to find the resource, without referring to
322    *            the classpath.
323    */
324   public void addResource(URL url) {
325   addResourceObject(url);
326   }
327
328   /**
329    * Add a configuration resource.
330    *
331    * The properties of this resource will override properties of previously
332    * added resources, unless they were marked final.
333    *
334    * @param file file-path of resource to be added, the local filesystem is
335    *             examined directly to find the resource, without referring to
336    *             the classpath.
337    */
338   public void addResource(Path file) {
339   addResourceObject(file);
340   }
341
342   /**
343    * Add a configuration resource.
344    *
345    * The properties of this resource will override properties of previously
346    * added resources, unless they were marked final.
347    *
348    * @param in InputStream to deserialize the object from.
349    */
350   public void addResource(InputStream in) {
351   addResourceObject(in);
352   }
353   //Hadoop在创建配置类的时候,主要考虑了下面三种资源(此外上面的addResource()还有一个直接接收InputStream的重载):
354   //
355   //URL资源(网络资源,指的是一个链接);
356   //
357   //CLASSPATH资源(String形式);
358   //
359   //Hadoop文件系统中的Path资源(该资源是基于Hadoop的FileSystem的,使用斜线“/”作为分隔符,如果是绝对路径,应该以“/”开始)   
360   /**
361    * Reload configuration from previously added resources.
362    *
363    * This method will clear all the configuration read from the added
364    * resources, and final parameters. This will make the resources to
365    * be read again before accessing the values. Values that are added
366    * via set methods will overlay values read from the resources.
367    */
368   public synchronized void reloadConfiguration() {
369   properties = null;                            // trigger reload
370   finalParameters.clear();                      // clear site-limits
371   }
372   //让先前添加的配置资源重新加载。这里并不立即读取,只是把properties置为null并清空finalParameters,下次访问配置值时才会重新读取资源。方法加了synchronized关键字
373   private synchronized void addResourceObject(Object resource) {
374   resources.add(resource);                      // add to resources
375   reloadConfiguration();
376   }
377   //注意resource是Object类型。会触发配置的重新加载
378   private static Pattern varPat = Pattern.compile("\\$\\{[^\\}\\$\u0020]+\\}");
379   private static int MAX_SUBST = 20;
380
381   private String substituteVars(String expr) {
382   if (expr == null) {
383       return null;
384   }
385   Matcher match = varPat.matcher("");
386   String eval = expr;
387   for(int s=0; s0) {
1374         sb.append(", ");
1375       }
1376   }
1377   toString(resources, sb);
1378   return sb.toString();
1379   }
1380
1381   private void toString(List resources, StringBuffer sb) {
1382   ListIterator i = resources.listIterator();
1383   while (i.hasNext()) {
1384       if (i.nextIndex() != 0) {
1385         sb.append(", ");
1386       }
1387       sb.append(i.next());
1388   }
1389   }
1390
1391   /**
1392    * Set the quietness-mode.
1393    *
1394    * In the quiet-mode, error and informational messages might not be logged.
1395    *
1396    * @param quietmode true to set quiet-mode on, false
1397    *            to turn it off.
1398    */
1399   public synchronized void setQuietMode(boolean quietmode) {
1400   this.quietmode = quietmode;
1401   }
1402
1403   /** For debugging. Creates a new Configuration and passes System.out to writeXml (per the original note: lists non-default properties to the terminal), then exits. */
1404   public static void main(String[] args) throws Exception {
1405   new Configuration().writeXml(System.out);
1406   }
1407   
1408   @Override
1409   public void readFields(DataInput in) throws IOException {
1410   clear();
1411   int size = WritableUtils.readVInt(in);
1412   for(int i=0; i < size; ++i) {
1413       set(org.apache.hadoop.io.Text.readString(in),
1414         org.apache.hadoop.io.Text.readString(in));
1415   }
1416   }
1417   //反序列化方法。从输入流(例如RPC传来的数据)中读出键值对,还原成“对象”,这个方法是@Override的,所以对象就是配置属性
1418   //set前调用了clear(),这个方法是本类的方法,然后clear()方法调用了Hashtable的clear()方法(Properties继承自Hashtable),把所有配置全清了
1419   //@Override
1420   public void write(DataOutput out) throws IOException {
1421   Properties props = getProps();
1422   WritableUtils.writeVInt(out, props.size());
1423   for(Map.Entry item: props.entrySet()) {
1424       org.apache.hadoop.io.Text.writeString(out, (String) item.getKey());
1425       org.apache.hadoop.io.Text.writeString(out, (String) item.getValue());
1426   }
1427   }
1428   //序列化方法。把对象序列化为RPC的流
1429   /**
1430    * Get the String key/value pairs whose key contains a match for the regex (Matcher.find, so a partial match is enough).
1431    * @param regex regular expression compiled with Pattern.compile and searched for in each key
1432    * @return a new HashMap holding every matching key with its value; empty if nothing matches
1433    */
1434   public Map getValByRegex(String regex) {
1435   Pattern p = Pattern.compile(regex);
1436   
1437   Map result = new HashMap();
1438   Matcher m;
1439   
1440   for(Map.Entry item: getProps().entrySet()) {
1441       if (item.getKey() instanceof String &&
1442         item.getValue() instanceof String) {
1443         m = p.matcher((String)item.getKey());
1444         if(m.find()) { // find() succeeds when the pattern occurs anywhere in the key
1445         result.put((String) item.getKey(), (String) item.getValue());
1446         }
1447       }
1448   }
1449   return result;
1450   }
1451 }
1452 //通过正则获取键值对集合


  


  MapReduce的执行简单流程:用户作业执行JobClient.runJob(conf)代码会在Hadoop集群上将其启动。启动之后JobClient实例会向JobTracker获取JobId,而且客户端会将作业执行需要的作业资源复制到HDFS上,然后将作业提交给JobTracker。JobTracker在本地初始化作业,再从HDFS作业资源中获取作业输入的分割信息,根据这些信息JobTracker将作业分割成多个任务,然后分配给在与JobTracker心跳通信中请求任务的TaskTracker。TaskTracker接收到新的任务之后会先从HDFS上获取作业资源,包括作业配置信息和本作业分片输入,然后在本地启动一个JVM并执行任务。任务结束后将结果写回HDFS,并向JobTracker报告。

  有些东西我说的不太明白,这里借鉴别人的
  http://anyoneking.com/archives/212
  http://f.dataguru.cn/thread-258563-1-1.html

  
  原来JDK的源码包在JDK安装文件夹中,只要把src.zip解压就能用eclipse去attached source了。我这个井底蛙。

  
  即使这样逐个看它的变量和方法,也给人一种雾里看花,不识庐山真面目的感觉。
  只有真正熟悉了hadoop的启动过程,以及启动过程中对配置API的调用和相关处理,才能彻底理解这个类。
  我还不够格,在网上查了也只是跟着别人的思路走,期待后面自己体验和调试来融会贯通。

  
页: [1]
查看完整版本: org.apache.hadoop.conf-Configuration